aco: Initial commit of independent AMD compiler
author     Daniel Schürmann <daniel@schuermann.dev>
Tue, 17 Sep 2019 11:22:17 +0000 (13:22 +0200)
committer  Daniel Schürmann <daniel@schuermann.dev>
Thu, 19 Sep 2019 10:10:00 +0000 (12:10 +0200)
ACO (short for AMD Compiler) is a new compiler backend with the goal of replacing
LLVM for Radeon hardware in the RADV driver.

ACO currently supports only VS, PS and CS on VI and Vega.
There are some optimizations missing because of unmerged NIR changes, which may
decrease performance.

Full commit history can be found at
https://github.com/daniel-schuermann/mesa/commits/backend

Co-authored-by: Daniel Schürmann <daniel@schuermann.dev>
Co-authored-by: Rhys Perry <pendingchaos02@gmail.com>
Co-authored-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Co-authored-by: Connor Abbott <cwabbott0@gmail.com>
Co-authored-by: Michael Schellenberger Costa <mschellenbergercosta@googlemail.com>
Co-authored-by: Timur Kristóf <timur.kristof@gmail.com>
Acked-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
31 files changed:
src/amd/compiler/README [new file with mode: 0644]
src/amd/compiler/aco_assembler.cpp [new file with mode: 0644]
src/amd/compiler/aco_builder_h.py [new file with mode: 0644]
src/amd/compiler/aco_dead_code_analysis.cpp [new file with mode: 0644]
src/amd/compiler/aco_dominance.cpp [new file with mode: 0644]
src/amd/compiler/aco_insert_NOPs.cpp [new file with mode: 0644]
src/amd/compiler/aco_insert_exec_mask.cpp [new file with mode: 0644]
src/amd/compiler/aco_insert_waitcnt.cpp [new file with mode: 0644]
src/amd/compiler/aco_instruction_selection.cpp [new file with mode: 0644]
src/amd/compiler/aco_instruction_selection_setup.cpp [new file with mode: 0644]
src/amd/compiler/aco_interface.cpp [new file with mode: 0644]
src/amd/compiler/aco_interface.h [new file with mode: 0644]
src/amd/compiler/aco_ir.h [new file with mode: 0644]
src/amd/compiler/aco_live_var_analysis.cpp [new file with mode: 0644]
src/amd/compiler/aco_lower_bool_phis.cpp [new file with mode: 0644]
src/amd/compiler/aco_lower_to_hw_instr.cpp [new file with mode: 0644]
src/amd/compiler/aco_opcodes.py [new file with mode: 0644]
src/amd/compiler/aco_opcodes_cpp.py [new file with mode: 0644]
src/amd/compiler/aco_opcodes_h.py [new file with mode: 0644]
src/amd/compiler/aco_opt_value_numbering.cpp [new file with mode: 0644]
src/amd/compiler/aco_optimizer.cpp [new file with mode: 0644]
src/amd/compiler/aco_print_asm.cpp [new file with mode: 0644]
src/amd/compiler/aco_print_ir.cpp [new file with mode: 0644]
src/amd/compiler/aco_reduce_assign.cpp [new file with mode: 0644]
src/amd/compiler/aco_register_allocation.cpp [new file with mode: 0644]
src/amd/compiler/aco_scheduler.cpp [new file with mode: 0644]
src/amd/compiler/aco_spill.cpp [new file with mode: 0644]
src/amd/compiler/aco_ssa_elimination.cpp [new file with mode: 0644]
src/amd/compiler/aco_util.h [new file with mode: 0644]
src/amd/compiler/aco_validate.cpp [new file with mode: 0644]
src/amd/compiler/meson.build [new file with mode: 0644]

diff --git a/src/amd/compiler/README b/src/amd/compiler/README
new file mode 100644 (file)
index 0000000..87d63c0
--- /dev/null
@@ -0,0 +1,87 @@
+# Unofficial GCN/RDNA ISA reference errata
+
+## v_sad_u32
+
+The Vega ISA reference writes its behaviour as:
+```
+D.u = abs(S0.i - S1.i) + S2.u.
+```
+This is incorrect. The actual behaviour is what is written in the GCN3 reference
+guide:
+```
+ABS_DIFF (A,B) = (A>B) ? (A-B) : (B-A)
+D.u = ABS_DIFF (S0.u,S1.u) + S2.u
+```
+The instruction doesn't subtract S0 and S1 and take the absolute value (the
+_signed_ distance); it uses the _unsigned_ distance between the operands. So
+`v_sad_u32(-5, 0, 0)` would return `4294967291` (`-5` interpreted as unsigned),
+not `5`.
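+
+A minimal C++ model of the GCN3 behaviour (a sketch based on the pseudocode
+above; the helper name is just illustrative):
+```
+#include <cassert>
+#include <cstdint>
+
+/* v_sad_u32 per GCN3: the _unsigned_ distance between S0 and S1, plus S2 */
+uint32_t v_sad_u32(uint32_t s0, uint32_t s1, uint32_t s2)
+{
+   uint32_t abs_diff = s0 > s1 ? s0 - s1 : s1 - s0;
+   return abs_diff + s2;
+}
+
+int main()
+{
+   /* -5 becomes 0xFFFFFFFB, so its unsigned distance to 0 is 4294967291 */
+   assert(v_sad_u32((uint32_t)-5, 0, 0) == 4294967291u);
+}
+```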
+
+## s_bfe_*
+
+Both the Vega and GCN3 ISA references write that these instructions don't write
+SCC. They do.
+
+## v_bcnt_u32_b32
+
+The Vega ISA reference writes its behaviour as:
+```
+D.u = 0;
+for i in 0 ... 31 do
+D.u += (S0.u[i] == 1 ? 1 : 0);
+endfor.
+```
+This is incorrect. The actual behaviour (and number of operands) is what
+is written in the GCN3 reference guide:
+```
+D.u = CountOneBits(S0.u) + S1.u.
+```
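+
+In other words, the instruction has two source operands. A minimal C++ model
+(a sketch based on the GCN3 pseudocode above):
+```
+#include <bitset>
+#include <cstdint>
+
+/* v_bcnt_u32_b32 per GCN3: population count of S0, plus S1 */
+uint32_t v_bcnt_u32_b32(uint32_t s0, uint32_t s1)
+{
+   return (uint32_t)std::bitset<32>(s0).count() + s1;
+}
+```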
+
+## SMEM stores
+
+The Vega ISA reference doesn't say this (or doesn't make it clear), but
+the offset for SMEM stores must be in m0 if IMM == 0.
+
+The RDNA ISA doesn't mention SMEM stores at all, but they seem to be supported
+by the chip and are present in LLVM. However, AMD devs highly recommend
+avoiding these instructions.
+
+## SMEM atomics
+
+RDNA ISA: same as the SMEM stores, the ISA pretends they don't exist, but they
+are there in LLVM.
+
+## VMEM stores
+
+All reference guides say (under "Vector Memory Instruction Data Dependencies"):
+> When a VM instruction is issued, the address is immediately read out of VGPRs
+> and sent to the texture cache. Any texture or buffer resources and samplers
+> are also sent immediately. However, write-data is not immediately sent to the
+> texture cache.
+
+Reading that, one might think that waitcnts need to be added when writing to
+the registers used for a VMEM store's data. Experimentation has shown that this
+does not seem to be the case on GFX8 and GFX9 (GFX6 and GFX7 are untested). It
+also seems unlikely, since NOPs are apparently needed in a subset of these
+situations.
+
+## MIMG opcodes on GFX8/GCN3
+
+The `image_atomic_{swap,cmpswap,add,sub}` opcodes in the GCN3 ISA reference
+guide are incorrect. The Vega ISA reference guide has the correct ones.
+
+## Legacy instructions
+
+Some instructions have a `_LEGACY` variant which implements "DX9 rules", in which
+zero "wins" in multiplications, i.e. `0.0*x` is always `0.0`. The Vega ISA
+mentions `V_MAC_LEGACY_F32`, but this instruction is not actually present on Vega.
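+
+The difference only shows up for special operands; a small C++ sketch of the two
+multiplication rules (assuming the usual DX9 "zero wins" semantics; the helper
+names are illustrative):
+```
+/* IEEE multiply: 0.0 * Inf and 0.0 * NaN both give NaN */
+float mul_ieee(float a, float b) { return a * b; }
+
+/* _LEGACY (DX9) multiply: if either operand is zero, the result is zero */
+float mul_legacy(float a, float b)
+{
+   if (a == 0.0f || b == 0.0f)
+      return 0.0f;
+   return a * b;
+}
+```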
+
+# Hardware Bugs
+
+## SMEM corrupts VCCZ on SI/CI
+
+https://github.com/llvm/llvm-project/blob/acb089e12ae48b82c0b05c42326196a030df9b82/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp#L580-L616
+After issuing an SMEM instruction, we need to wait for it to finish and then
+write to vcc (for example, `s_mov_b64 vcc, vcc`) to correct vccz.
+
+Currently, we don't do this.
+
diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
new file mode 100644 (file)
index 0000000..a6bf2a3
--- /dev/null
@@ -0,0 +1,497 @@
+#include <map>
+
+#include "aco_ir.h"
+#include "common/sid.h"
+
+namespace aco {
+
+struct asm_context {
+   Program *program;
+   enum chip_class chip_class;
+   std::map<int, SOPP_instruction*> branches;
+   std::vector<unsigned> constaddrs;
+   const int16_t* opcode;
+   // TODO: keep track of branch instructions referring to blocks
+   // and, when emitting the block, correct the offset in instr
+   asm_context(Program* program) : program(program), chip_class(program->chip_class) {
+      if (chip_class <= GFX9)
+         opcode = &instr_info.opcode_gfx9[0];
+   }
+};
+
+void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
+{
+   uint32_t instr_offset = out.size() * 4u;
+
+   /* lower remaining pseudo-instructions */
+   if (instr->opcode == aco_opcode::p_constaddr) {
+      unsigned dest = instr->definitions[0].physReg();
+      unsigned offset = instr->operands[0].constantValue();
+
+      /* s_getpc_b64 dest[0:1] */
+      uint32_t encoding = (0b101111101 << 23);
+      uint32_t opcode = ctx.opcode[(int)aco_opcode::s_getpc_b64];
+      if (opcode >= 55 && ctx.chip_class <= GFX9) {
+         assert(ctx.chip_class == GFX9 && opcode < 60);
+         opcode = opcode - 4;
+      }
+      encoding |= dest << 16;
+      encoding |= opcode << 8;
+      out.push_back(encoding);
+
+      /* s_add_u32 dest[0], dest[0], ... */
+      encoding = (0b10 << 30);
+      encoding |= ctx.opcode[(int)aco_opcode::s_add_u32] << 23;
+      encoding |= dest << 16;
+      encoding |= dest;
+      encoding |= 255 << 8;
+      out.push_back(encoding);
+      ctx.constaddrs.push_back(out.size());
+      out.push_back(-(instr_offset + 4) + offset);
+
+      /* s_addc_u32 dest[1], dest[1], 0 */
+      encoding = (0b10 << 30);
+      encoding |= ctx.opcode[(int)aco_opcode::s_addc_u32] << 23;
+      encoding |= (dest + 1) << 16;
+      encoding |= dest + 1;
+      encoding |= 128 << 8;
+      out.push_back(encoding);
+      return;
+   }
+
+   uint32_t opcode = ctx.opcode[(int)instr->opcode];
+   if (opcode == (uint32_t)-1) {
+      fprintf(stderr, "Unsupported opcode: ");
+      aco_print_instr(instr, stderr);
+      abort();
+   }
+
+   switch (instr->format) {
+   case Format::SOP2: {
+      uint32_t encoding = (0b10 << 30);
+      encoding |= opcode << 23;
+      encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;
+      encoding |= instr->operands.size() >= 2 ? instr->operands[1].physReg() << 8 : 0;
+      encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
+      out.push_back(encoding);
+      break;
+   }
+   case Format::SOPK: {
+      uint32_t encoding = (0b1011 << 28);
+      encoding |= opcode << 23;
+      encoding |=
+         !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) ?
+         instr->definitions[0].physReg() << 16 :
+         !instr->operands.empty() && !(instr->operands[0].physReg() == scc) ?
+         instr->operands[0].physReg() << 16 : 0;
+      encoding |= static_cast<SOPK_instruction*>(instr)->imm;
+      out.push_back(encoding);
+      break;
+   }
+   case Format::SOP1: {
+      uint32_t encoding = (0b101111101 << 23);
+      if (opcode >= 55 && ctx.chip_class <= GFX9) {
+         assert(ctx.chip_class == GFX9 && opcode < 60);
+         opcode = opcode - 4;
+      }
+      encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;
+      encoding |= opcode << 8;
+      encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
+      out.push_back(encoding);
+      break;
+   }
+   case Format::SOPC: {
+      uint32_t encoding = (0b101111110 << 23);
+      encoding |= opcode << 16;
+      encoding |= instr->operands.size() == 2 ? instr->operands[1].physReg() << 8 : 0;
+      encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
+      out.push_back(encoding);
+      break;
+   }
+   case Format::SOPP: {
+      SOPP_instruction* sopp = static_cast<SOPP_instruction*>(instr);
+      uint32_t encoding = (0b101111111 << 23);
+      encoding |= opcode << 16;
+      encoding |= (uint16_t) sopp->imm;
+      if (sopp->block != -1)
+         ctx.branches.insert({out.size(), sopp});
+      out.push_back(encoding);
+      break;
+   }
+   case Format::SMEM: {
+      SMEM_instruction* smem = static_cast<SMEM_instruction*>(instr);
+      uint32_t encoding = (0b110000 << 26);
+      encoding |= opcode << 18;
+      if (instr->operands.size() >= 2)
+         encoding |= instr->operands[1].isConstant() ? 1 << 17 : 0;
+      bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
+      assert(!soe || ctx.chip_class >= GFX9);
+      encoding |= soe ? 1 << 14 : 0;
+      encoding |= smem->glc ? 1 << 16 : 0;
+      if (!instr->definitions.empty() || instr->operands.size() >= 3)
+         encoding |= (!instr->definitions.empty() ? instr->definitions[0].physReg() : instr->operands[2].physReg().reg) << 6;
+      if (instr->operands.size() >= 1)
+         encoding |= instr->operands[0].physReg() >> 1;
+      out.push_back(encoding);
+      encoding = 0;
+      if (instr->operands.size() >= 2)
+         encoding |= instr->operands[1].isConstant() ? instr->operands[1].constantValue() : instr->operands[1].physReg().reg;
+      encoding |= soe ? instr->operands.back().physReg() << 25 : 0;
+      out.push_back(encoding);
+      return;
+   }
+   case Format::VOP2: {
+      uint32_t encoding = 0;
+      encoding |= opcode << 25;
+      encoding |= (0xFF & instr->definitions[0].physReg().reg) << 17;
+      encoding |= (0xFF & instr->operands[1].physReg().reg) << 9;
+      encoding |= instr->operands[0].physReg().reg;
+      out.push_back(encoding);
+      break;
+   }
+   case Format::VOP1: {
+      uint32_t encoding = (0b0111111 << 25);
+      encoding |= (0xFF & instr->definitions[0].physReg().reg) << 17;
+      encoding |= opcode << 9;
+      encoding |= instr->operands[0].physReg().reg;
+      out.push_back(encoding);
+      break;
+   }
+   case Format::VOPC: {
+      uint32_t encoding = (0b0111110 << 25);
+      encoding |= opcode << 17;
+      encoding |= (0xFF & instr->operands[1].physReg().reg) << 9;
+      encoding |= instr->operands[0].physReg().reg;
+      out.push_back(encoding);
+      break;
+   }
+   case Format::VINTRP: {
+      Interp_instruction* interp = static_cast<Interp_instruction*>(instr);
+      uint32_t encoding = (0b110101 << 26);
+      encoding |= (0xFF & instr->definitions[0].physReg().reg) << 18;
+      encoding |= opcode << 16;
+      encoding |= interp->attribute << 10;
+      encoding |= interp->component << 8;
+      if (instr->opcode == aco_opcode::v_interp_mov_f32)
+         encoding |= (0x3 & instr->operands[0].constantValue());
+      else
+         encoding |= (0xFF & instr->operands[0].physReg().reg);
+      out.push_back(encoding);
+      break;
+   }
+   case Format::DS: {
+      DS_instruction* ds = static_cast<DS_instruction*>(instr);
+      uint32_t encoding = (0b110110 << 26);
+      encoding |= opcode << 17;
+      encoding |= (ds->gds ? 1 : 0) << 16;
+      encoding |= ((0xFF & ds->offset1) << 8);
+      encoding |= (0xFFFF & ds->offset0);
+      out.push_back(encoding);
+      encoding = 0;
+      unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0;
+      encoding |= (0xFF & reg) << 24;
+      reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0)  ? instr->operands[2].physReg() : 0;
+      encoding |= (0xFF & reg) << 16;
+      reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) ? instr->operands[1].physReg() : 0;
+      encoding |= (0xFF & reg) << 8;
+      encoding |= (0xFF & instr->operands[0].physReg().reg);
+      out.push_back(encoding);
+      break;
+   }
+   case Format::MUBUF: {
+      MUBUF_instruction* mubuf = static_cast<MUBUF_instruction*>(instr);
+      uint32_t encoding = (0b111000 << 26);
+      encoding |= opcode << 18;
+      encoding |= (mubuf->slc ? 1 : 0) << 17;
+      encoding |= (mubuf->lds ? 1 : 0) << 16;
+      encoding |= (mubuf->glc ? 1 : 0) << 14;
+      encoding |= (mubuf->idxen ? 1 : 0) << 13;
+      encoding |= (mubuf->offen ? 1 : 0) << 12;
+      encoding |= 0x0FFF & mubuf->offset;
+      out.push_back(encoding);
+      encoding = 0;
+      encoding |= instr->operands[2].physReg() << 24;
+      encoding |= (mubuf->tfe ? 1 : 0) << 23;
+      encoding |= (instr->operands[1].physReg() >> 2) << 16;
+      unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg().reg;
+      encoding |= (0xFF & reg) << 8;
+      encoding |= (0xFF & instr->operands[0].physReg().reg);
+      out.push_back(encoding);
+      break;
+   }
+   case Format::MTBUF: {
+      MTBUF_instruction* mtbuf = static_cast<MTBUF_instruction*>(instr);
+      uint32_t encoding = (0b111010 << 26);
+      encoding |= opcode << 15;
+      encoding |= (mtbuf->glc ? 1 : 0) << 14;
+      encoding |= (mtbuf->idxen ? 1 : 0) << 13;
+      encoding |= (mtbuf->offen ? 1 : 0) << 12;
+      encoding |= 0x0FFF & mtbuf->offset;
+      encoding |= (0xF & mtbuf->dfmt) << 19;
+      encoding |= (0x7 & mtbuf->nfmt) << 23;
+      out.push_back(encoding);
+      encoding = 0;
+      encoding |= instr->operands[2].physReg().reg << 24;
+      encoding |= (mtbuf->tfe ? 1 : 0) << 23;
+      encoding |= (mtbuf->slc ? 1 : 0) << 22;
+      encoding |= (instr->operands[1].physReg().reg >> 2) << 16;
+      unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg().reg : instr->definitions[0].physReg().reg;
+      encoding |= (0xFF & reg) << 8;
+      encoding |= (0xFF & instr->operands[0].physReg().reg);
+      out.push_back(encoding);
+      break;
+   }
+   case Format::MIMG: {
+      MIMG_instruction* mimg = static_cast<MIMG_instruction*>(instr);
+      uint32_t encoding = (0b111100 << 26);
+      encoding |= mimg->slc ? 1 << 25 : 0;
+      encoding |= opcode << 18;
+      encoding |= mimg->lwe ? 1 << 17 : 0;
+      encoding |= mimg->tfe ? 1 << 16 : 0;
+      encoding |= mimg->r128 ? 1 << 15 : 0;
+      encoding |= mimg->da ? 1 << 14 : 0;
+      encoding |= mimg->glc ? 1 << 13 : 0;
+      encoding |= mimg->unrm ? 1 << 12 : 0;
+      encoding |= (0xF & mimg->dmask) << 8;
+      out.push_back(encoding);
+      encoding = (0xFF & instr->operands[0].physReg().reg); /* VADDR */
+      if (!instr->definitions.empty()) {
+         encoding |= (0xFF & instr->definitions[0].physReg().reg) << 8; /* VDATA */
+      } else if (instr->operands.size() == 4) {
+         encoding |= (0xFF & instr->operands[3].physReg().reg) << 8; /* VDATA */
+      }
+      encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 16; /* T# (resource) */
+      if (instr->operands.size() > 2)
+         encoding |= (0x1F & (instr->operands[2].physReg() >> 2)) << 21; /* sampler */
+      // TODO VEGA: D16
+      out.push_back(encoding);
+      break;
+   }
+   case Format::FLAT:
+   case Format::SCRATCH:
+   case Format::GLOBAL: {
+      FLAT_instruction *flat = static_cast<FLAT_instruction*>(instr);
+      uint32_t encoding = (0b110111 << 26);
+      encoding |= opcode << 18;
+      encoding |= flat->offset & 0x1fff;
+      if (instr->format == Format::SCRATCH)
+         encoding |= 1 << 14;
+      else if (instr->format == Format::GLOBAL)
+         encoding |= 2 << 14;
+      encoding |= flat->lds ? 1 << 13 : 0;
+      encoding |= flat->glc ? 1 << 16 : 0;
+      encoding |= flat->slc ? 1 << 17 : 0;
+      out.push_back(encoding);
+      encoding = (0xFF & instr->operands[0].physReg().reg);
+      if (!instr->definitions.empty())
+         encoding |= (0xFF & instr->definitions[0].physReg().reg) << 24;
+      else
+         encoding |= (0xFF & instr->operands[2].physReg().reg) << 8;
+      if (!instr->operands[1].isUndefined()) {
+         assert(instr->operands[1].physReg() != 0x7f);
+         assert(instr->format != Format::FLAT);
+         encoding |= instr->operands[1].physReg() << 16;
+      } else if (instr->format != Format::FLAT) {
+         encoding |= 0x7F << 16;
+      }
+      encoding |= flat->nv ? 1 << 23 : 0;
+      out.push_back(encoding);
+      break;
+   }
+   case Format::EXP: {
+      Export_instruction* exp = static_cast<Export_instruction*>(instr);
+      uint32_t encoding = (0b110001 << 26);
+      encoding |= exp->valid_mask ? 0b1 << 12 : 0;
+      encoding |= exp->done ? 0b1 << 11 : 0;
+      encoding |= exp->compressed ? 0b1 << 10 : 0;
+      encoding |= exp->dest << 4;
+      encoding |= exp->enabled_mask;
+      out.push_back(encoding);
+      encoding = 0xFF & exp->operands[0].physReg().reg;
+      encoding |= (0xFF & exp->operands[1].physReg().reg) << 8;
+      encoding |= (0xFF & exp->operands[2].physReg().reg) << 16;
+      encoding |= (0xFF & exp->operands[3].physReg().reg) << 24;
+      out.push_back(encoding);
+      break;
+   }
+   case Format::PSEUDO:
+   case Format::PSEUDO_BARRIER:
+      unreachable("Pseudo instructions should be lowered before assembly.");
+   default:
+      if ((uint16_t) instr->format & (uint16_t) Format::VOP3A) {
+         VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr);
+
+         if ((uint16_t) instr->format & (uint16_t) Format::VOP2)
+            opcode = opcode + 0x100;
+         else if ((uint16_t) instr->format & (uint16_t) Format::VOP1)
+            opcode = opcode + 0x140;
+         else if ((uint16_t) instr->format & (uint16_t) Format::VOPC)
+            opcode = opcode + 0x0;
+         else if ((uint16_t) instr->format & (uint16_t) Format::VINTRP)
+            opcode = opcode + 0x270;
+
+         // TODO: op_sel
+         uint32_t encoding = (0b110100 << 26);
+         encoding |= opcode << 16;
+         encoding |= (vop3->clamp ? 1 : 0) << 15;
+         for (unsigned i = 0; i < 3; i++)
+            encoding |= vop3->abs[i] << (8+i);
+         if (instr->definitions.size() == 2)
+            encoding |= instr->definitions[1].physReg() << 8;
+         encoding |= (0xFF & instr->definitions[0].physReg().reg);
+         out.push_back(encoding);
+         encoding = 0;
+         if (instr->opcode == aco_opcode::v_interp_mov_f32) {
+            encoding = 0x3 & instr->operands[0].constantValue();
+         } else {
+            for (unsigned i = 0; i < instr->operands.size(); i++)
+               encoding |= instr->operands[i].physReg() << (i * 9);
+         }
+         encoding |= vop3->omod << 27;
+         for (unsigned i = 0; i < 3; i++)
+            encoding |= vop3->neg[i] << (29+i);
+         out.push_back(encoding);
+         return;
+
+      } else if (instr->isDPP()){
+         /* first emit the instruction without the DPP operand */
+         Operand dpp_op = instr->operands[0];
+         instr->operands[0] = Operand(PhysReg{250}, v1);
+         instr->format = (Format) ((uint32_t) instr->format & ~(1 << 14));
+         emit_instruction(ctx, out, instr);
+         DPP_instruction* dpp = static_cast<DPP_instruction*>(instr);
+         uint32_t encoding = (0xF & dpp->row_mask) << 28;
+         encoding |= (0xF & dpp->bank_mask) << 24;
+         encoding |= dpp->abs[1] << 23;
+         encoding |= dpp->neg[1] << 22;
+         encoding |= dpp->abs[0] << 21;
+         encoding |= dpp->neg[0] << 20;
+         encoding |= dpp->bound_ctrl << 19;
+         encoding |= dpp->dpp_ctrl << 8;
+         encoding |= (0xFF) & dpp_op.physReg().reg;
+         out.push_back(encoding);
+         return;
+      } else {
+         unreachable("unimplemented instruction format");
+      }
+   }
+
+   /* append literal dword */
+   for (const Operand& op : instr->operands) {
+      if (op.isLiteral()) {
+         out.push_back(op.constantValue());
+         break;
+      }
+   }
+}
+
+void emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
+{
+   for (aco_ptr<Instruction>& instr : block.instructions) {
+#if 0
+      int start_idx = out.size();
+      std::cerr << "Encoding:\t" << std::endl;
+      aco_print_instr(&*instr, stderr);
+      std::cerr << std::endl;
+#endif
+      emit_instruction(ctx, out, instr.get());
+#if 0
+      for (int i = start_idx; i < out.size(); i++)
+         std::cerr << "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex << out[i] << std::endl;
+#endif
+   }
+}
+
+void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
+{
+   for (int idx = program->blocks.size() - 1; idx >= 0; idx--) {
+      Block& block = program->blocks[idx];
+      std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin();
+      bool endBlock = false;
+      bool exported = false;
+      while ( it != block.instructions.rend())
+      {
+         if ((*it)->format == Format::EXP && endBlock) {
+            Export_instruction* exp = static_cast<Export_instruction*>((*it).get());
+            if (program->stage & hw_vs) {
+               if (exp->dest >= V_008DFC_SQ_EXP_POS && exp->dest <= (V_008DFC_SQ_EXP_POS + 3)) {
+                  exp->done = true;
+                  exported = true;
+                  break;
+               }
+            } else {
+               exp->done = true;
+               exp->valid_mask = true;
+               exported = true;
+               break;
+            }
+         } else if ((*it)->definitions.size() && (*it)->definitions[0].physReg() == exec)
+            break;
+         else if ((*it)->opcode == aco_opcode::s_endpgm) {
+            if (endBlock)
+               break;
+            endBlock = true;
+         }
+         ++it;
+      }
+      if (!endBlock || exported)
+         continue;
+      /* we didn't find an Export instruction and have to insert a null export */
+      aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
+      for (unsigned i = 0; i < 4; i++)
+         exp->operands[i] = Operand(v1);
+      exp->enabled_mask = 0;
+      exp->compressed = false;
+      exp->done = true;
+      exp->valid_mask = program->stage & hw_fs;
+      if (program->stage & hw_fs)
+         exp->dest = 9; /* NULL */
+      else
+         exp->dest = V_008DFC_SQ_EXP_POS;
+      /* insert the null export one instruction before the endpgm */
+      block.instructions.insert(block.instructions.end() - 1, std::move(exp));
+   }
+}
+
+void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
+{
+   for (std::pair<int, SOPP_instruction*> branch : ctx.branches)
+   {
+      int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1;
+      out[branch.first] |= (uint16_t) offset;
+   }
+}
+
+void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
+{
+   for (unsigned addr : ctx.constaddrs)
+      out[addr] += out.size() * 4u;
+}
+
+unsigned emit_program(Program* program,
+                      std::vector<uint32_t>& code)
+{
+   asm_context ctx(program);
+
+   if (program->stage & (hw_vs | hw_fs))
+      fix_exports(ctx, code, program);
+
+   for (Block& block : program->blocks) {
+      block.offset = code.size();
+      emit_block(ctx, code, block);
+   }
+
+   fix_branches(ctx, code);
+   fix_constaddrs(ctx, code);
+
+   unsigned constant_data_offset = code.size() * sizeof(uint32_t);
+   while (program->constant_data.size() % 4u)
+      program->constant_data.push_back(0);
+   /* Copy constant data */
+   code.insert(code.end(), (uint32_t*)program->constant_data.data(),
+               (uint32_t*)(program->constant_data.data() + program->constant_data.size()));
+
+   return constant_data_offset;
+}
+
+}
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
new file mode 100644 (file)
index 0000000..7be3a66
--- /dev/null
@@ -0,0 +1,400 @@
+
+template = """\
+/*
+ * Copyright (c) 2019 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * This file was generated by aco_builder_h.py
+ */
+
+#ifndef _ACO_BUILDER_
+#define _ACO_BUILDER_
+
+#include "aco_ir.h"
+#include "util/u_math.h"
+#include "util/bitscan.h"
+
+namespace aco {
+enum dpp_ctrl {
+    _dpp_quad_perm = 0x000,
+    _dpp_row_sl = 0x100,
+    _dpp_row_sr = 0x110,
+    _dpp_row_rr = 0x120,
+    dpp_wf_sl1 = 0x130,
+    dpp_wf_rl1 = 0x134,
+    dpp_wf_sr1 = 0x138,
+    dpp_wf_rr1 = 0x13C,
+    dpp_row_mirror = 0x140,
+    dpp_row_half_mirror = 0x141,
+    dpp_row_bcast15 = 0x142,
+    dpp_row_bcast31 = 0x143
+};
+
+inline dpp_ctrl
+dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
+{
+    assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
+    return (dpp_ctrl)(lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6));
+}
+
+inline dpp_ctrl
+dpp_row_sl(unsigned amount)
+{
+    assert(amount > 0 && amount < 16);
+    return (dpp_ctrl)(((unsigned) _dpp_row_sl) | amount);
+}
+
+inline dpp_ctrl
+dpp_row_sr(unsigned amount)
+{
+    assert(amount > 0 && amount < 16);
+    return (dpp_ctrl)(((unsigned) _dpp_row_sr) | amount);
+}
+
+inline unsigned
+ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
+{
+    assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
+    return and_mask | (or_mask << 5) | (xor_mask << 10);
+}
+
+aco_ptr<Instruction> create_s_mov(Definition dst, Operand src);
+
+class Builder {
+public:
+   struct Result {
+      Instruction *instr;
+
+      Result(Instruction *instr) : instr(instr) {}
+
+      operator Instruction *() const {
+         return instr;
+      }
+
+      operator Temp() const {
+         return instr->definitions[0].getTemp();
+      }
+
+      operator Operand() const {
+         return Operand((Temp)*this);
+      }
+
+      Definition& def(unsigned index) const {
+         return instr->definitions[index];
+      }
+
+      aco_ptr<Instruction> get_ptr() const {
+        return aco_ptr<Instruction>(instr);
+      }
+   };
+
+   struct Op {
+      Operand op;
+      Op(Temp tmp) : op(tmp) {}
+      Op(Operand op_) : op(op_) {}
+      Op(Result res) : op((Temp)res) {}
+   };
+
+   Program *program;
+   bool use_iterator;
+   union {
+   bool forwards; //when use_iterator == true
+   bool start; //when use_iterator == false
+   };
+   std::vector<aco_ptr<Instruction>> *instructions;
+   std::vector<aco_ptr<Instruction>>::iterator it;
+
+   Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), instructions(NULL) {}
+   Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), instructions(&block->instructions) {}
+   Builder(Program *pgm, std::vector<aco_ptr<Instruction>> *instrs) : program(pgm), use_iterator(false), start(false), instructions(instrs) {}
+
+   void moveEnd(Block *block) {
+      instructions = &block->instructions;
+   }
+
+   void reset() {
+      use_iterator = false;
+      start = false;
+      instructions = NULL;
+   }
+
+   void reset(Block *block) {
+      use_iterator = false;
+      start = false;
+      instructions = &block->instructions;
+   }
+
+   void reset(std::vector<aco_ptr<Instruction>> *instrs) {
+      use_iterator = false;
+      start = false;
+      instructions = instrs;
+   }
+
+   Result insert(aco_ptr<Instruction> instr) {
+      Instruction *instr_ptr = instr.get();
+      if (instructions) {
+         if (use_iterator) {
+            it = instructions->emplace(it, std::move(instr));
+            if (forwards)
+               it = std::next(it);
+         } else if (!start) {
+            instructions->emplace_back(std::move(instr));
+         } else {
+            instructions->emplace(instructions->begin(), std::move(instr));
+         }
+      }
+      return Result(instr_ptr);
+   }
+
+   Result insert(Instruction* instr) {
+      if (instructions) {
+         if (use_iterator) {
+            it = instructions->emplace(it, aco_ptr<Instruction>(instr));
+            if (forwards)
+               it = std::next(it);
+         } else if (!start) {
+            instructions->emplace_back(aco_ptr<Instruction>(instr));
+         } else {
+            instructions->emplace(instructions->begin(), aco_ptr<Instruction>(instr));
+         }
+      }
+      return Result(instr);
+   }
+
+   Temp tmp(RegClass rc) {
+      return (Temp){program->allocateId(), rc};
+   }
+
+   Temp tmp(RegType type, unsigned size) {
+      return (Temp){program->allocateId(), RegClass(type, size)};
+   }
+
+   Definition def(RegClass rc) {
+      return Definition((Temp){program->allocateId(), rc});
+   }
+
+   Definition def(RegType type, unsigned size) {
+      return Definition((Temp){program->allocateId(), RegClass(type, size)});
+   }
+
+   Definition def(RegClass rc, PhysReg reg) {
+      return Definition(program->allocateId(), reg, rc);
+   }
+
+% for fixed in ['m0', 'vcc', 'exec', 'scc']:
+   Operand ${fixed}(Temp tmp) {
+       Operand op(tmp);
+       op.setFixed(aco::${fixed});
+       return op;
+   }
+
+   Definition ${fixed}(Definition def) {
+       def.setFixed(aco::${fixed});
+       return def;
+   }
+
+   Definition hint_${fixed}(Definition def) {
+       def.setHint(aco::${fixed});
+       return def;
+   }
+
+% endfor
+   /* hand-written helpers */
+   Temp as_uniform(Op op)
+   {
+      assert(op.op.isTemp());
+      if (op.op.getTemp().type() == RegType::vgpr)
+         return pseudo(aco_opcode::p_as_uniform, def(RegType::sgpr, op.op.size()), op);
+      else
+         return op.op.getTemp();
+   }
+
+   Result v_mul_imm(Definition dst, Temp tmp, uint32_t imm, bool bits24=false)
+   {
+      assert(tmp.type() == RegType::vgpr);
+      if (imm == 0) {
+         return vop1(aco_opcode::v_mov_b32, dst, Operand(0u));
+      } else if (imm == 1) {
+         return copy(dst, Operand(tmp));
+      } else if (util_is_power_of_two_or_zero(imm)) {
+         return vop2(aco_opcode::v_lshlrev_b32, dst, Operand((uint32_t)ffs(imm) - 1u), tmp);
+      } else if (bits24) {
+        return vop2(aco_opcode::v_mul_u32_u24, dst, Operand(imm), tmp);
+      } else {
+        Temp imm_tmp = copy(def(v1), Operand(imm));
+        return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp);
+      }
+   }
+
+   Result v_mul24_imm(Definition dst, Temp tmp, uint32_t imm)
+   {
+      return v_mul_imm(dst, tmp, imm, true);
+   }
+
+   Result copy(Definition dst, Op op_) {
+      Operand op = op_.op;
+      if (dst.regClass() == s1 && op.size() == 1 && op.isLiteral()) {
+         uint32_t imm = op.constantValue();
+         if (imm >= 0xffff8000 || imm <= 0x7fff) {
+            return sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
+         } else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) {
+            uint32_t rev = util_bitreverse(imm);
+            return dst.regClass() == v1 ?
+                   vop1(aco_opcode::v_bfrev_b32, dst, Operand(rev)) :
+                   sop1(aco_opcode::s_brev_b32, dst, Operand(rev));
+         } else if (imm != 0) {
+            unsigned start = (ffs(imm) - 1) & 0x1f;
+            unsigned size = util_bitcount(imm) & 0x1f;
+            if ((((1u << size) - 1u) << start) == imm)
+                return sop2(aco_opcode::s_bfm_b32, dst, Operand(size), Operand(start));
+         }
+      }
+
+      if (dst.regClass() == s2) {
+        return sop1(aco_opcode::s_mov_b64, dst, op);
+      } else if (op.size() > 1) {
+         return pseudo(aco_opcode::p_create_vector, dst, op);
+      } else if (dst.regClass() == v1 || dst.regClass() == v1.as_linear()) {
+        return vop1(aco_opcode::v_mov_b32, dst, op);
+      } else {
+        assert(dst.regClass() == s1);
+        return sop1(aco_opcode::s_mov_b32, dst, op);
+      }
+   }
+
+   Result vadd32(Definition dst, Op a, Op b, bool carry_out=false, Op carry_in=Op(Operand(s2))) {
+      if (!b.op.isTemp() || b.op.regClass().type() != RegType::vgpr)
+         std::swap(a, b);
+      assert(b.op.isTemp() && b.op.regClass().type() == RegType::vgpr);
+
+      if (!carry_in.op.isUndefined())
+         return vop2(aco_opcode::v_addc_co_u32, Definition(dst), hint_vcc(def(s2)), a, b, carry_in);
+      else if (program->chip_class < GFX9 || carry_out)
+         return vop2(aco_opcode::v_add_co_u32, Definition(dst), hint_vcc(def(s2)), a, b);
+      else
+         return vop2(aco_opcode::v_add_u32, Definition(dst), a, b);
+   }
+
+   Result vsub32(Definition dst, Op a, Op b, bool carry_out=false, Op borrow=Op(Operand(s2)))
+   {
+      if (!borrow.op.isUndefined() || program->chip_class < GFX9)
+         carry_out = true;
+
+      bool reverse = !b.op.isTemp() || b.op.regClass().type() != RegType::vgpr;
+      if (reverse)
+         std::swap(a, b);
+      assert(b.op.isTemp() && b.op.regClass().type() == RegType::vgpr);
+
+      aco_opcode op;
+      Temp carry;
+      if (carry_out) {
+         carry = tmp(s2);
+         if (borrow.op.isUndefined())
+            op = reverse ? aco_opcode::v_subrev_co_u32 : aco_opcode::v_sub_co_u32;
+         else
+            op = reverse ? aco_opcode::v_subbrev_co_u32 : aco_opcode::v_subb_co_u32;
+      } else {
+         op = reverse ? aco_opcode::v_subrev_u32 : aco_opcode::v_sub_u32;
+      }
+
+      int num_ops = borrow.op.isUndefined() ? 2 : 3;
+      int num_defs = carry_out ? 2 : 1;
+      aco_ptr<Instruction> sub{create_instruction<VOP2_instruction>(op, Format::VOP2, num_ops, num_defs)};
+      sub->operands[0] = a.op;
+      sub->operands[1] = b.op;
+      if (!borrow.op.isUndefined())
+         sub->operands[2] = borrow.op;
+      sub->definitions[0] = dst;
+      if (carry_out) {
+         sub->definitions[1] = Definition(carry);
+         sub->definitions[1].setHint(aco::vcc);
+      }
+      return insert(std::move(sub));
+   }
+<%
+import itertools
+formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]),
+           ("sop1", [Format.SOP1], 'SOP1_instruction', [(1, 1), (2, 1), (3, 2)]),
+           ("sop2", [Format.SOP2], 'SOP2_instruction', itertools.product([1, 2], [2, 3])),
+           ("sopk", [Format.SOPK], 'SOPK_instruction', itertools.product([0, 1, 2], [0, 1])),
+           ("sopp", [Format.SOPP], 'SOPP_instruction', [(0, 0), (0, 1)]),
+           ("sopc", [Format.SOPC], 'SOPC_instruction', [(1, 2)]),
+           ("smem", [Format.SMEM], 'SMEM_instruction', [(0, 4), (0, 3), (1, 0), (1, 3), (1, 2), (0, 0)]),
+           ("ds", [Format.DS], 'DS_instruction', [(1, 1), (1, 2), (0, 3), (0, 4)]),
+           ("mubuf", [Format.MUBUF], 'MUBUF_instruction', [(0, 4), (1, 3)]),
+           ("mimg", [Format.MIMG], 'MIMG_instruction', [(0, 4), (1, 3), (0, 3), (1, 2)]), #TODO(pendingchaos): less shapes?
+           ("exp", [Format.EXP], 'Export_instruction', [(0, 4)]),
+           ("branch", [Format.PSEUDO_BRANCH], 'Pseudo_branch_instruction', itertools.product([0], [0, 1])),
+           ("barrier", [Format.PSEUDO_BARRIER], 'Pseudo_barrier_instruction', [(0, 0)]),
+           ("reduction", [Format.PSEUDO_REDUCTION], 'Pseudo_reduction_instruction', [(3, 2)]),
+           ("vop1", [Format.VOP1], 'VOP1_instruction', [(1, 1), (2, 2)]),
+           ("vop2", [Format.VOP2], 'VOP2_instruction', itertools.product([1, 2], [2, 3])),
+           ("vopc", [Format.VOPC], 'VOPC_instruction', itertools.product([1, 2], [2])),
+           ("vop3", [Format.VOP3A], 'VOP3A_instruction', [(1, 3), (1, 2), (1, 1), (2, 2)]),
+           ("vintrp", [Format.VINTRP], 'Interp_instruction', [(1, 2), (1, 3)]),
+           ("vop1_dpp", [Format.VOP1, Format.DPP], 'DPP_instruction', [(1, 1)]),
+           ("vop2_dpp", [Format.VOP2, Format.DPP], 'DPP_instruction', itertools.product([1, 2], [2, 3])),
+           ("vopc_dpp", [Format.VOPC, Format.DPP], 'DPP_instruction', itertools.product([1, 2], [2])),
+           ("vop1_e64", [Format.VOP1, Format.VOP3A], 'VOP3A_instruction', itertools.product([1], [1])),
+           ("vop2_e64", [Format.VOP2, Format.VOP3A], 'VOP3A_instruction', itertools.product([1, 2], [2, 3])),
+           ("vopc_e64", [Format.VOPC, Format.VOP3A], 'VOP3A_instruction', itertools.product([1, 2], [2])),
+           ("flat", [Format.FLAT], 'FLAT_instruction', [(0, 3), (1, 2)]),
+           ("global", [Format.GLOBAL], 'FLAT_instruction', [(0, 3), (1, 2)])]
+%>\\
+% for name, formats, struct, shapes in formats:
+    % for num_definitions, num_operands in shapes:
+        <%
+        args = ['aco_opcode opcode']
+        for i in range(num_definitions):
+            args.append('Definition def%d' % i)
+        for i in range(num_operands):
+            args.append('Op op%d' % i)
+        for f in formats:
+            args += f.get_builder_field_decls()
+        %>\\
+
+   Result ${name}(${', '.join(args)})
+   {
+      ${struct} *instr = create_instruction<${struct}>(opcode, (Format)(${'|'.join('(int)Format::%s' % f.name for f in formats)}), ${num_operands}, ${num_definitions});
+        % for i in range(num_definitions):
+            instr->definitions[${i}] = def${i};
+        % endfor
+        % for i in range(num_operands):
+            instr->operands[${i}] = op${i}.op;
+        % endfor
+        % for f in formats:
+            % for dest, field_name in zip(f.get_builder_field_dests(), f.get_builder_field_names()):
+      instr->${dest} = ${field_name};
+            % endfor
+        % endfor
+      return insert(instr);
+   }
+    % endfor
+% endfor
+};
+
+}
+#endif /* _ACO_BUILDER_ */"""
+
+from aco_opcodes import opcodes, Format
+from mako.template import Template
+
+print(Template(template).render(opcodes=opcodes, Format=Format))
diff --git a/src/amd/compiler/aco_dead_code_analysis.cpp b/src/amd/compiler/aco_dead_code_analysis.cpp
new file mode 100644 (file)
index 0000000..f56718f
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * Copyright © 2019 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "aco_ir.h"
+
+#include <algorithm>
+
+/*
+ * Implements an analysis pass to determine the number of uses
+ * for each SSA-definition.
+ */
+
+namespace aco {
+namespace {
+
+struct dce_ctx {
+   int current_block;
+   std::vector<uint16_t> uses;
+   std::vector<std::vector<bool>> live;
+
+   dce_ctx(Program* program) : current_block(program->blocks.size() - 1), uses(program->peekAllocationId())
+   {
+      live.reserve(program->blocks.size());
+      for (Block& block : program->blocks)
+         live.emplace_back(block.instructions.size());
+   }
+};
+
+void process_block(dce_ctx& ctx, Block& block)
+{
+   std::vector<bool>& live = ctx.live[block.index];
+   assert(live.size() == block.instructions.size());
+   bool process_predecessors = false;
+   for (int idx = block.instructions.size() - 1; idx >= 0; idx--) {
+      if (live[idx])
+         continue;
+
+      aco_ptr<Instruction>& instr = block.instructions[idx];
+      const bool is_live = instr->definitions.empty() ||
+                           std::any_of(instr->definitions.begin(), instr->definitions.end(),
+                              [&ctx] (const Definition& def) { return !def.isTemp() || ctx.uses[def.tempId()];});
+
+      if (is_live) {
+         for (const Operand& op : instr->operands) {
+            if (op.isTemp()) {
+               if (ctx.uses[op.tempId()] == 0)
+                  process_predecessors = true;
+               ctx.uses[op.tempId()]++;
+            }
+         }
+         live[idx] = true;
+      }
+   }
+
+   if (process_predecessors) {
+      for (unsigned pred_idx : block.linear_preds)
+         ctx.current_block = std::max(ctx.current_block, (int) pred_idx);
+   }
+}
+
+} /* end namespace */
+
+std::vector<uint16_t> dead_code_analysis(Program *program) {
+
+   dce_ctx ctx(program);
+
+   while (ctx.current_block >= 0) {
+      unsigned next_block = ctx.current_block--;
+      process_block(ctx, program->blocks[next_block]);
+   }
+
+   /* add one use to exec to prevent startpgm from being removed */
+   aco_ptr<Instruction>& startpgm = program->blocks[0].instructions[0];
+   assert(startpgm->opcode == aco_opcode::p_startpgm);
+   ctx.uses[startpgm->definitions.back().tempId()]++;
+
+   return ctx.uses;
+}
+
+}
+
diff --git a/src/amd/compiler/aco_dominance.cpp b/src/amd/compiler/aco_dominance.cpp
new file mode 100644 (file)
index 0000000..de5549e
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * Copyright © 2018 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
+ *
+ */
+
+#ifndef ACO_DOMINANCE_CPP
+#define ACO_DOMINANCE_CPP
+
+#include "aco_ir.h"
+
+/*
+ * Implements the algorithms for computing the dominator tree from
+ * "A Simple, Fast Dominance Algorithm" by Cooper, Harvey, and Kennedy.
+ *
+ * Unlike the paper, our CFG allows the dominator tree to be computed in a
+ * single pass, as it is guaranteed that the dominating predecessors are
+ * processed before the current block.
+ */
+
+namespace aco {
+
+void dominator_tree(Program* program)
+{
+   program->blocks[0].logical_idom = 0;
+   program->blocks[0].linear_idom = 0;
+
+   for (unsigned i = 1; i < program->blocks.size(); i++) {
+      Block& block = program->blocks[i];
+      int new_logical_idom = -1;
+      int new_linear_idom = -1;
+      for (unsigned pred_idx : block.logical_preds) {
+         if ((int) program->blocks[pred_idx].logical_idom == -1)
+            continue;
+         
+         if (new_logical_idom == -1) {
+            new_logical_idom = pred_idx;
+            continue;
+         }
+         
+         while ((int) pred_idx != new_logical_idom) {
+            if ((int) pred_idx > new_logical_idom)
+               pred_idx = program->blocks[pred_idx].logical_idom;
+            if ((int) pred_idx < new_logical_idom)
+               new_logical_idom = program->blocks[new_logical_idom].logical_idom;
+         }
+      }
+
+      for (unsigned pred_idx : block.linear_preds) {
+         if ((int) program->blocks[pred_idx].linear_idom == -1)
+            continue;
+            
+         if (new_linear_idom == -1) {
+            new_linear_idom = pred_idx;
+            continue;
+         }
+         
+         while ((int) pred_idx != new_linear_idom) {
+            if ((int) pred_idx > new_linear_idom)
+               pred_idx = program->blocks[pred_idx].linear_idom;
+            if ((int) pred_idx < new_linear_idom)
+               new_linear_idom = program->blocks[new_linear_idom].linear_idom;
+         }
+      }
+
+      block.logical_idom = new_logical_idom;
+      block.linear_idom = new_linear_idom;
+   }
+}
+
+}
+#endif
diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
new file mode 100644 (file)
index 0000000..fea1364
--- /dev/null
@@ -0,0 +1,282 @@
+/*
+ * Copyright © 2019 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "aco_ir.h"
+
+namespace aco {
+namespace {
+
+struct NOP_ctx {
+   /* just initialize these with something less than max NOPs */
+   int VALU_wrexec = -10;
+   int VALU_wrvcc = -10;
+   int VALU_wrsgpr = -10;
+   enum chip_class chip_class;
+   unsigned vcc_physical;
+   NOP_ctx(Program* program) : chip_class(program->chip_class) {
+      vcc_physical = program->config->num_sgprs - 2;
+   }
+};
+
+bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
+{
+   if ((uint32_t) instr->format & (uint32_t) Format::VOPC)
+      return true;
+   if (instr->isVOP3() && instr->definitions.size() == 2)
+      return true;
+   if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32)
+      return true;
+   return false;
+}
+
+bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
+{
+   return a_reg > b_reg ?
+          (a_reg - b_reg < b_size) :
+          (b_reg - a_reg < a_size);
+}
+
+int handle_instruction(NOP_ctx& ctx, aco_ptr<Instruction>& instr,
+                       std::vector<aco_ptr<Instruction>>& old_instructions,
+                       std::vector<aco_ptr<Instruction>>& new_instructions)
+{
+   int new_idx = new_instructions.size();
+
+   // TODO: setreg / getreg / m0 writes
+   // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
+
+   /* break off from previous SMEM clause if needed */
+   if (instr->format == Format::SMEM && ctx.chip_class >= GFX8) {
+      const bool is_store = instr->definitions.empty();
+      for (int pred_idx = new_idx - 1; pred_idx >= 0; pred_idx--) {
+         aco_ptr<Instruction>& pred = new_instructions[pred_idx];
+         if (pred->format != Format::SMEM)
+            break;
+
+         /* Don't allow clauses with store instructions since the clause's
+          * instructions may use the same address. */
+         if (is_store || pred->definitions.empty())
+            return 1;
+
+         Definition& instr_def = instr->definitions[0];
+         Definition& pred_def = pred->definitions[0];
+
+         /* ISA reference doesn't say anything about this, but best to be safe */
+         if (regs_intersect(instr_def.physReg(), instr_def.size(), pred_def.physReg(), pred_def.size()))
+            return 1;
+
+         for (const Operand& op : pred->operands) {
+            if (op.isConstant() || !op.isFixed())
+               continue;
+            if (regs_intersect(instr_def.physReg(), instr_def.size(), op.physReg(), op.size()))
+               return 1;
+         }
+         for (const Operand& op : instr->operands) {
+            if (op.isConstant() || !op.isFixed())
+               continue;
+            if (regs_intersect(pred_def.physReg(), pred_def.size(), op.physReg(), op.size()))
+               return 1;
+         }
+      }
+   } else if (instr->isVALU() || instr->format == Format::VINTRP) {
+      int NOPs = 0;
+
+      if (instr->isDPP()) {
+         /* VALU does not forward EXEC to DPP. */
+         if (ctx.VALU_wrexec + 5 >= new_idx)
+            NOPs = 5 + ctx.VALU_wrexec - new_idx + 1;
+
+         /* VALU DPP reads VGPR written by VALU */
+         for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 2; pred_idx--) {
+            aco_ptr<Instruction>& pred = new_instructions[pred_idx];
+            if ((pred->isVALU() || pred->format == Format::VINTRP) &&
+                !pred->definitions.empty() &&
+                pred->definitions[0].physReg() == instr->operands[0].physReg()) {
+               NOPs = std::max(NOPs, 2 + pred_idx - new_idx + 1);
+               break;
+            }
+         }
+      }
+
+      /* SALU writes M0 */
+      if (instr->format == Format::VINTRP && new_idx > 0 && ctx.chip_class >= GFX9) {
+         aco_ptr<Instruction>& pred = new_instructions.back();
+         if (pred->isSALU() &&
+             !pred->definitions.empty() &&
+             pred->definitions[0].physReg() == m0)
+            NOPs = std::max(NOPs, 1);
+      }
+
+      for (const Operand& op : instr->operands) {
+         /* VALU which uses VCCZ */
+         if (op.physReg() == PhysReg{251} &&
+             ctx.VALU_wrvcc + 5 >= new_idx)
+            NOPs = std::max(NOPs, 5 + ctx.VALU_wrvcc - new_idx + 1);
+
+         /* VALU which uses EXECZ */
+         if (op.physReg() == PhysReg{252} &&
+             ctx.VALU_wrexec + 5 >= new_idx)
+            NOPs = std::max(NOPs, 5 + ctx.VALU_wrexec - new_idx + 1);
+
+         /* VALU which reads VCC as a constant */
+         if (ctx.VALU_wrvcc + 1 >= new_idx) {
+            for (unsigned k = 0; k < op.size(); k++) {
+               unsigned reg = op.physReg() + k;
+               if (reg == ctx.vcc_physical || reg == ctx.vcc_physical + 1)
+                  NOPs = std::max(NOPs, 1);
+            }
+         }
+      }
+
+      switch (instr->opcode) {
+         case aco_opcode::v_readlane_b32:
+         case aco_opcode::v_writelane_b32: {
+            if (ctx.VALU_wrsgpr + 4 < new_idx)
+               break;
+            PhysReg reg = instr->operands[1].physReg();
+            for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 4; pred_idx--) {
+               aco_ptr<Instruction>& pred = new_instructions[pred_idx];
+               if (!pred->isVALU() || !VALU_writes_sgpr(pred))
+                  continue;
+               for (const Definition& def : pred->definitions) {
+                  if (def.physReg() == reg)
+                     NOPs = std::max(NOPs, 4 + pred_idx - new_idx + 1);
+               }
+            }
+            break;
+         }
+         case aco_opcode::v_div_fmas_f32:
+         case aco_opcode::v_div_fmas_f64: {
+            if (ctx.VALU_wrvcc + 4 >= new_idx)
+               NOPs = std::max(NOPs, 4 + ctx.VALU_wrvcc - new_idx + 1);
+            break;
+         }
+         default:
+            break;
+      }
+
+      /* Write VGPRs holding writedata > 64 bit from MIMG/MUBUF instructions */
+      // FIXME: handle case if the last instruction of a block without branch is such store
+      // TODO: confirm that DS instructions cannot cause WAR hazards here
+      if (new_idx > 0) {
+         aco_ptr<Instruction>& pred = new_instructions.back();
+         if (pred->isVMEM() &&
+             pred->operands.size() == 4 &&
+             pred->operands[3].size() > 2 &&
+             pred->operands[1].size() != 8 &&
+             (pred->format != Format::MUBUF || pred->operands[2].physReg() >= 102)) {
+            /* Ops that use a 256-bit T# do not need a wait state.
+             * BUFFER_STORE_* operations that use an SGPR for "offset"
+             * do not require any wait states. */
+            PhysReg wrdata = pred->operands[3].physReg();
+            unsigned size = pred->operands[3].size();
+            assert(wrdata >= 256);
+            for (const Definition& def : instr->definitions) {
+               if (regs_intersect(def.physReg(), def.size(), wrdata, size))
+                  NOPs = std::max(NOPs, 1);
+            }
+         }
+      }
+
+      if (VALU_writes_sgpr(instr)) {
+         for (const Definition& def : instr->definitions) {
+            if (def.physReg() == vcc)
+               ctx.VALU_wrvcc = NOPs ? new_idx : new_idx + 1;
+            else if (def.physReg() == exec)
+               ctx.VALU_wrexec = NOPs ? new_idx : new_idx + 1;
+            else if (def.physReg() <= 102)
+               ctx.VALU_wrsgpr = NOPs ? new_idx : new_idx + 1;
+         }
+      }
+      return NOPs;
+   } else if (instr->isVMEM() && ctx.VALU_wrsgpr + 5 >= new_idx) {
+      /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
+      for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 5; pred_idx--) {
+         aco_ptr<Instruction>& pred = new_instructions[pred_idx];
+         if (!(pred->isVALU() && VALU_writes_sgpr(pred)))
+            continue;
+
+         for (const Definition& def : pred->definitions) {
+            if (def.physReg() > 102)
+               continue;
+
+            if (instr->operands.size() > 1 &&
+                regs_intersect(instr->operands[1].physReg(), instr->operands[1].size(),
+                               def.physReg(), def.size())) {
+                  return 5 + pred_idx - new_idx + 1;
+            }
+
+            if (instr->operands.size() > 2 &&
+                regs_intersect(instr->operands[2].physReg(), instr->operands[2].size(),
+                               def.physReg(), def.size())) {
+                  return 5 + pred_idx - new_idx + 1;
+            }
+         }
+      }
+   }
+
+   return 0;
+}
+
+
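+/* rebuilds the block's instruction list, prepending an s_nop wherever
+ * handle_instruction() reports that wait states are required */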
+void handle_block(NOP_ctx& ctx, Block& block)
+{
+   std::vector<aco_ptr<Instruction>> instructions;
+   instructions.reserve(block.instructions.size());
+   for (unsigned i = 0; i < block.instructions.size(); i++) {
+      aco_ptr<Instruction>& instr = block.instructions[i];
+      unsigned NOPs = handle_instruction(ctx, instr, block.instructions, instructions);
+      if (NOPs) {
+         // TODO: try to move the instruction down
+         /* create NOP */
+         aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
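+         /* s_nop with immediate N provides N+1 wait states */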
+         nop->imm = NOPs - 1;
+         nop->block = -1;
+         instructions.emplace_back(std::move(nop));
+      }
+
+      instructions.emplace_back(std::move(instr));
+   }
+
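+   /* turn the recorded hazard positions into offsets relative to the start of
+    * the next block (they become negative), so the distance checks keep
+    * working across block boundaries */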
+   ctx.VALU_wrvcc -= instructions.size();
+   ctx.VALU_wrexec -= instructions.size();
+   ctx.VALU_wrsgpr -= instructions.size();
+   block.instructions = std::move(instructions);
+}
+
+} /* end namespace */
+
+
+void insert_NOPs(Program* program)
+{
+   NOP_ctx ctx(program);
+   for (Block& block : program->blocks) {
+      if (block.instructions.empty())
+         continue;
+
+      handle_block(ctx, block);
+   }
+}
+
+}
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
new file mode 100644 (file)
index 0000000..7886a4c
--- /dev/null
@@ -0,0 +1,1078 @@
+/*
+ * Copyright © 2019 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "aco_ir.h"
+#include "aco_builder.h"
+
+namespace aco {
+
+namespace {
+
+enum WQMState : uint8_t {
+   Unspecified = 0,
+   Exact = 1 << 0,
+   WQM = 1 << 1, /* with control flow applied */
+   Preserve_WQM = 1 << 2,
+   Exact_Branch = 1 << 3,
+};
+
+enum mask_type : uint8_t {
+   mask_type_global = 1 << 0,
+   mask_type_exact = 1 << 1,
+   mask_type_wqm = 1 << 2,
+   mask_type_loop = 1 << 3, /* active lanes of a loop */
+   mask_type_initial = 1 << 4, /* initially active lanes */
+};
+
+struct wqm_ctx {
+   Program* program;
+   /* state for WQM propagation */
+   std::set<unsigned> worklist;
+   std::vector<uint16_t> defined_in;
+   std::vector<bool> needs_wqm;
+   std::vector<bool> branch_wqm; /* true if the branch condition in this block should be in wqm */
+   bool loop;
+   bool wqm;
+   wqm_ctx(Program* program) : program(program),
+                               defined_in(program->peekAllocationId(), 0xFFFF),
+                               needs_wqm(program->peekAllocationId()),
+                               branch_wqm(program->blocks.size()),
+                               loop(false),
+                               wqm(false)
+   {
+      for (unsigned i = 0; i < program->blocks.size(); i++)
+         worklist.insert(i);
+   }
+};
+
+struct loop_info {
+   Block* loop_header;
+   uint16_t num_exec_masks;
+   uint8_t needs;
+   bool has_divergent_break;
+   bool has_divergent_continue;
+   bool has_discard;
+   loop_info(Block* b, uint16_t num, uint8_t needs, bool breaks, bool cont, bool discard) :
+             loop_header(b), num_exec_masks(num), needs(needs), has_divergent_break(breaks),
+             has_divergent_continue(cont), has_discard(discard) {}
+};
+
+struct block_info {
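+   /* stack of (exec mask, mask_type flags) pairs; the back() entry holds the
+    * mask currently live in the exec register */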
+   std::vector<std::pair<Temp, uint8_t>> exec;
+   std::vector<WQMState> instr_needs;
+   uint8_t block_needs;
+   uint8_t ever_again_needs;
+   /* more... */
+};
+
+struct exec_ctx {
+   Program *program;
+   std::vector<block_info> info;
+   std::vector<loop_info> loop;
+   bool handle_wqm = false;
+   exec_ctx(Program *program) : program(program), info(program->blocks.size()) {}
+};
+
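+/* returns whether the instruction is predicated by the exec mask, i.e.
+ * whether its effect depends on which lanes are currently active */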
+bool pred_by_exec_mask(aco_ptr<Instruction>& instr) {
+   if (instr->format == Format::SMEM || instr->isSALU())
+      return false;
+   if (instr->format == Format::PSEUDO_BARRIER)
+      return false;
+
+   if (instr->format == Format::PSEUDO) {
+      switch (instr->opcode) {
+      case aco_opcode::p_create_vector:
+         return instr->definitions[0].getTemp().type() == RegType::vgpr;
+      case aco_opcode::p_extract_vector:
+      case aco_opcode::p_split_vector:
+         return instr->operands[0].getTemp().type() == RegType::vgpr;
+      case aco_opcode::p_spill:
+      case aco_opcode::p_reload:
+         return false;
+      default:
+         break;
+      }
+   }
+
+   if (instr->opcode == aco_opcode::v_readlane_b32 ||
+       instr->opcode == aco_opcode::v_writelane_b32)
+      return false;
+
+   return true;
+}
+
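+/* returns whether the instruction must be executed with the exact mask
+ * (stores with disable_wqm set, exports and the FS sbuffer store), so that
+ * helper invocations have no observable side effects */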
+bool needs_exact(aco_ptr<Instruction>& instr) {
+   if (instr->format == Format::MUBUF) {
+      MUBUF_instruction *mubuf = static_cast<MUBUF_instruction *>(instr.get());
+      return mubuf->disable_wqm;
+   } else if (instr->format == Format::MTBUF) {
+      MTBUF_instruction *mtbuf = static_cast<MTBUF_instruction *>(instr.get());
+      return mtbuf->disable_wqm;
+   } else if (instr->format == Format::MIMG) {
+      MIMG_instruction *mimg = static_cast<MIMG_instruction *>(instr.get());
+      return mimg->disable_wqm;
+   } else {
+      return instr->format == Format::EXP || instr->opcode == aco_opcode::p_fs_buffer_store_smem;
+   }
+}
+
+void set_needs_wqm(wqm_ctx &ctx, Temp tmp)
+{
+   if (!ctx.needs_wqm[tmp.id()]) {
+      ctx.needs_wqm[tmp.id()] = true;
+      if (ctx.defined_in[tmp.id()] != 0xFFFF)
+         ctx.worklist.insert(ctx.defined_in[tmp.id()]);
+   }
+}
+
+void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx)
+{
+   if (ctx.branch_wqm[block_idx])
+      return;
+
+   ctx.branch_wqm[block_idx] = true;
+   Block& block = ctx.program->blocks[block_idx];
+   aco_ptr<Instruction>& branch = block.instructions.back();
+
+   if (branch->opcode != aco_opcode::p_branch) {
+      assert(!branch->operands.empty() && branch->operands[0].isTemp());
+      set_needs_wqm(ctx, branch->operands[0].getTemp());
+   }
+
+   /* TODO: this sets more branch conditions to WQM than necessary;
+    * it should be enough to stop at the "exec mask top level" */
+   if (block.kind & block_kind_top_level)
+      return;
+
+   for (unsigned pred_idx : block.logical_preds)
+      mark_block_wqm(ctx, pred_idx);
+}
+
+void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
+{
+   block_info& info = exec_ctx.info[block->index];
+
+   std::vector<WQMState> instr_needs(block->instructions.size());
+
+   if (block->kind & block_kind_top_level) {
+      if (ctx.loop && ctx.wqm) {
+         /* mark all break conditions as WQM */
+         unsigned block_idx = block->index + 1;
+         while (!(ctx.program->blocks[block_idx].kind & block_kind_top_level)) {
+            if (ctx.program->blocks[block_idx].kind & block_kind_break)
+               mark_block_wqm(ctx, block_idx);
+            block_idx++;
+         }
+      } else if (ctx.loop && !ctx.wqm) {
+         /* Ensure a branch never results in an exec mask with only helper
+          * invocations (which can cause a loop to repeat infinitely if its
+          * break branches are done in exact). */
+         unsigned block_idx = block->index;
+         do {
+            if ((ctx.program->blocks[block_idx].kind & block_kind_branch))
+               exec_ctx.info[block_idx].block_needs |= Exact_Branch;
+            block_idx++;
+         } while (!(ctx.program->blocks[block_idx].kind & block_kind_top_level));
+      }
+
+      ctx.loop = false;
+      ctx.wqm = false;
+   }
+
+   for (int i = block->instructions.size() - 1; i >= 0; --i)
+   {
+      aco_ptr<Instruction>& instr = block->instructions[i];
+
+      WQMState needs = needs_exact(instr) ? Exact : Unspecified;
+      bool propagate_wqm = instr->opcode == aco_opcode::p_wqm;
+      bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if;
+      bool pred_by_exec = pred_by_exec_mask(instr);
+      for (const Definition& definition : instr->definitions) {
+         if (!definition.isTemp())
+            continue;
+         const unsigned def = definition.tempId();
+         ctx.defined_in[def] = block->index;
+         if (needs == Unspecified && ctx.needs_wqm[def]) {
+            needs = pred_by_exec ? WQM : Unspecified;
+            propagate_wqm = true;
+         }
+      }
+
+      if (propagate_wqm) {
+         for (const Operand& op : instr->operands) {
+            if (op.isTemp()) {
+               set_needs_wqm(ctx, op.getTemp());
+            }
+         }
+      } else if (preserve_wqm && info.block_needs & WQM) {
+         needs = Preserve_WQM;
+      }
+
+      /* ensure the condition controlling the control flow for this phi is in WQM */
+      if (needs == WQM && instr->opcode == aco_opcode::p_phi) {
+         for (unsigned pred_idx : block->logical_preds)
+            mark_block_wqm(ctx, pred_idx);
+      }
+
+      instr_needs[i] = needs;
+      info.block_needs |= needs;
+   }
+
+   info.instr_needs = instr_needs;
+
+   /* for "if (<cond>) <wqm code>" or "while (<cond>) <wqm code>",
+    * <cond> should be computed in WQM */
+   if (info.block_needs & WQM && !(block->kind & block_kind_top_level)) {
+      for (unsigned pred_idx : block->logical_preds)
+         mark_block_wqm(ctx, pred_idx);
+      ctx.wqm = true;
+   }
+   if (block->kind & block_kind_loop_header)
+      ctx.loop = true;
+}
+
+void calculate_wqm_needs(exec_ctx& exec_ctx)
+{
+   wqm_ctx ctx(exec_ctx.program);
+
+   while (!ctx.worklist.empty()) {
+      unsigned block_index = *std::prev(ctx.worklist.end());
+      ctx.worklist.erase(std::prev(ctx.worklist.end()));
+
+      get_block_needs(ctx, exec_ctx, &exec_ctx.program->blocks[block_index]);
+   }
+
+   uint8_t ever_again_needs = 0;
+   for (int i = exec_ctx.program->blocks.size() - 1; i >= 0; i--) {
+      exec_ctx.info[i].ever_again_needs = ever_again_needs;
+      Block& block = exec_ctx.program->blocks[i];
+
+      if (block.kind & block_kind_needs_lowering)
+         exec_ctx.info[i].block_needs |= Exact;
+
+      /* if discard is used somewhere in nested CF, we need to preserve the WQM mask */
+      if ((block.kind & block_kind_discard ||
+           block.kind & block_kind_uses_discard_if) &&
+          ever_again_needs & WQM)
+         exec_ctx.info[i].block_needs |= Preserve_WQM;
+
+      ever_again_needs |= exec_ctx.info[i].block_needs & ~Exact_Branch;
+      if (block.kind & block_kind_discard ||
+          block.kind & block_kind_uses_discard_if)
+         ever_again_needs |= Exact;
+
+      /* don't propagate WQM preservation further than the next top_level block */
+      if (block.kind & block_kind_top_level)
+         ever_again_needs &= ~Preserve_WQM;
+      else
+         exec_ctx.info[i].block_needs &= ~Preserve_WQM;
+   }
+   exec_ctx.handle_wqm = true;
+}
+
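+/* makes the current exec mask a WQM mask: if the top of the stack is the
+ * global mask, its whole-quad version is computed and pushed; otherwise the
+ * exact mask on top is dropped and the WQM mask below it is restored */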
+void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
+{
+   if (ctx.info[idx].exec.back().second & mask_type_wqm)
+      return;
+   if (ctx.info[idx].exec.back().second & mask_type_global) {
+      Temp exec_mask = ctx.info[idx].exec.back().first;
+      exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), exec_mask);
+      ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
+      return;
+   }
+   /* otherwise, the WQM mask should be one below the current mask */
+   ctx.info[idx].exec.pop_back();
+   assert(ctx.info[idx].exec.back().second & mask_type_wqm);
+   ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+                                                ctx.info[idx].exec.back().first);
+}
+
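+/* makes the current exec mask exact: either drops the global WQM mask and
+ * restores the exact mask below it, or ANDs the outermost exact mask into
+ * exec and pushes the result as a new exact entry */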
+void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
+{
+   if (ctx.info[idx].exec.back().second & mask_type_exact)
+      return;
+   if (ctx.info[idx].exec.back().second & mask_type_global) {
+      ctx.info[idx].exec.pop_back();
+      assert(ctx.info[idx].exec.back().second & mask_type_exact);
+      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+                                                   ctx.info[idx].exec.back().first);
+      return;
+   }
+   /* otherwise, we create an exact mask and push to the stack */
+   Temp wqm = ctx.info[idx].exec.back().first;
+   Temp exact = bld.tmp(s2);
+   wqm = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+                  bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm));
+   ctx.info[idx].exec.back().first = wqm;
+   ctx.info[idx].exec.emplace_back(exact, mask_type_exact);
+}
+
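+/* emits the phis and parallelcopies which set up this block's exec mask stack
+ * from its predecessors and returns the index of the first instruction that
+ * still has to be handled by process_instructions() */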
+unsigned add_coupling_code(exec_ctx& ctx, Block* block,
+                           std::vector<aco_ptr<Instruction>>& instructions)
+{
+   unsigned idx = block->index;
+   Builder bld(ctx.program, &instructions);
+   std::vector<unsigned>& preds = block->linear_preds;
+
+   /* start block */
+   if (idx == 0) {
+      aco_ptr<Instruction>& startpgm = block->instructions[0];
+      assert(startpgm->opcode == aco_opcode::p_startpgm);
+      Temp exec_mask = startpgm->definitions.back().getTemp();
+      bld.insert(std::move(startpgm));
+
+      if (ctx.handle_wqm) {
+         ctx.info[0].exec.emplace_back(exec_mask, mask_type_global | mask_type_exact | mask_type_initial);
+         /* if this block only needs WQM, initialize already */
+         if (ctx.info[0].block_needs == WQM)
+            transition_to_WQM(ctx, bld, 0);
+      } else {
+         uint8_t mask = mask_type_global;
+         if (ctx.program->needs_wqm) {
+            exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), bld.exec(exec_mask));
+            mask |= mask_type_wqm;
+         } else {
+            mask |= mask_type_exact;
+         }
+         ctx.info[0].exec.emplace_back(exec_mask, mask);
+      }
+
+      return 1;
+   }
+
+   /* loop entry block */
+   if (block->kind & block_kind_loop_header) {
+      assert(preds[0] == idx - 1);
+      ctx.info[idx].exec = ctx.info[idx - 1].exec;
+      loop_info& info = ctx.loop.back();
+      while (ctx.info[idx].exec.size() > info.num_exec_masks)
+         ctx.info[idx].exec.pop_back();
+
+      /* create ssa names for outer exec masks */
+      if (info.has_discard) {
+         aco_ptr<Pseudo_instruction> phi;
+         for (int i = 0; i < info.num_exec_masks - 1; i++) {
+            phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
+            phi->definitions[0] = bld.def(s2);
+            phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first);
+            ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
+         }
+      }
+
+      /* create ssa name for restore mask */
+      if (info.has_divergent_break) {
+         /* this phi might be trivial but ensures a parallelcopy on the loop header */
+         aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
+         phi->definitions[0] = bld.def(s2);
+         phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
+         ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
+      }
+
+      /* create ssa name for loop active mask */
+      aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
+      if (info.has_divergent_continue)
+         phi->definitions[0] = bld.def(s2);
+      else
+         phi->definitions[0] = bld.def(s2, exec);
+      phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first);
+      Temp loop_active = bld.insert(std::move(phi));
+
+      if (info.has_divergent_break) {
+         uint8_t mask_type = (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop;
+         ctx.info[idx].exec.emplace_back(loop_active, mask_type);
+      } else {
+         ctx.info[idx].exec.back().first = loop_active;
+         ctx.info[idx].exec.back().second |= mask_type_loop;
+      }
+
+      /* create a parallelcopy to move the active mask to exec */
+      unsigned i = 0;
+      if (info.has_divergent_continue) {
+         while (block->instructions[i]->opcode != aco_opcode::p_logical_start) {
+            bld.insert(std::move(block->instructions[i]));
+            i++;
+         }
+         uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
+         ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+                                                    ctx.info[idx].exec.back().first), mask_type);
+      }
+
+      return i;
+   }
+
+   /* loop exit block */
+   if (block->kind & block_kind_loop_exit) {
+      Block* header = ctx.loop.back().loop_header;
+      loop_info& info = ctx.loop.back();
+
+      for (ASSERTED unsigned pred : preds)
+         assert(ctx.info[pred].exec.size() >= info.num_exec_masks);
+
+      /* fill the loop header phis */
+      std::vector<unsigned>& header_preds = header->linear_preds;
+      int k = 0;
+      if (info.has_discard) {
+         while (k < info.num_exec_masks - 1) {
+            aco_ptr<Instruction>& phi = header->instructions[k];
+            assert(phi->opcode == aco_opcode::p_linear_phi);
+            for (unsigned i = 1; i < phi->operands.size(); i++)
+               phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[k].first);
+            k++;
+         }
+      }
+      aco_ptr<Instruction>& phi = header->instructions[k++];
+      assert(phi->opcode == aco_opcode::p_linear_phi);
+      for (unsigned i = 1; i < phi->operands.size(); i++)
+         phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
+
+      if (info.has_divergent_break) {
+         aco_ptr<Instruction>& phi = header->instructions[k];
+         assert(phi->opcode == aco_opcode::p_linear_phi);
+         for (unsigned i = 1; i < phi->operands.size(); i++)
+            phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
+      }
+
+      assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);
+
+      /* create the loop exit phis if not trivial */
+      for (unsigned k = 0; k < info.num_exec_masks; k++) {
+         Temp same = ctx.info[preds[0]].exec[k].first;
+         uint8_t type = ctx.info[header_preds[0]].exec[k].second;
+         bool trivial = true;
+
+         for (unsigned i = 1; i < preds.size() && trivial; i++) {
+            if (ctx.info[preds[i]].exec[k].first != same)
+               trivial = false;
+         }
+
+         if (trivial) {
+            ctx.info[idx].exec.emplace_back(same, type);
+         } else {
+            /* create phi for loop footer */
+            aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
+            phi->definitions[0] = bld.def(s2);
+            for (unsigned i = 0; i < phi->operands.size(); i++)
+               phi->operands[i] = Operand(ctx.info[preds[i]].exec[k].first);
+            ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
+         }
+      }
+      assert(ctx.info[idx].exec.size() == info.num_exec_masks);
+
+      /* create a parallelcopy to move the live mask to exec */
+      unsigned i = 0;
+      while (block->instructions[i]->opcode != aco_opcode::p_logical_start) {
+         bld.insert(std::move(block->instructions[i]));
+         i++;
+      }
+
+      if (ctx.handle_wqm) {
+         if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
+            if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 ||
+                (ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) {
+               ctx.info[idx].exec.back().second |= mask_type_global;
+               transition_to_Exact(ctx, bld, idx);
+               ctx.handle_wqm = false;
+            }
+         }
+         if (ctx.info[idx].block_needs == WQM)
+            transition_to_WQM(ctx, bld, idx);
+         else if (ctx.info[idx].block_needs == Exact)
+            transition_to_Exact(ctx, bld, idx);
+      }
+
+      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+                                                   ctx.info[idx].exec.back().first);
+
+      ctx.loop.pop_back();
+      return i;
+   }
+
+   if (preds.size() == 1) {
+      ctx.info[idx].exec = ctx.info[preds[0]].exec;
+   } else {
+      assert(preds.size() == 2);
+      /* if one of the predecessors ends in an exact mask, we pop it from the stack */
+      unsigned num_exec_masks = std::min(ctx.info[preds[0]].exec.size(),
+                                         ctx.info[preds[1]].exec.size());
+      if (block->kind & block_kind_top_level && !(block->kind & block_kind_merge))
+         num_exec_masks = std::min(num_exec_masks, 2u);
+
+      /* create phis for diverged exec masks */
+      for (unsigned i = 0; i < num_exec_masks; i++) {
+         bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
+         if (!in_exec && ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) {
+            assert(ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
+            ctx.info[idx].exec.emplace_back(ctx.info[preds[0]].exec[i]);
+            continue;
+         }
+
+         Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(s2, exec) : bld.def(s2),
+                               ctx.info[preds[0]].exec[i].first,
+                               ctx.info[preds[1]].exec[i].first);
+         uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
+         ctx.info[idx].exec.emplace_back(phi, mask_type);
+      }
+   }
+
+   unsigned i = 0;
+   while (block->instructions[i]->opcode == aco_opcode::p_phi ||
+          block->instructions[i]->opcode == aco_opcode::p_linear_phi) {
+      bld.insert(std::move(block->instructions[i]));
+      i++;
+   }
+
+   if (block->kind & block_kind_merge)
+      ctx.info[idx].exec.pop_back();
+
+   if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 3) {
+      assert(ctx.info[idx].exec.back().second == mask_type_exact);
+      assert(block->kind & block_kind_merge);
+      ctx.info[idx].exec.pop_back();
+   }
+
+   /* try to satisfy the block's needs */
+   if (ctx.handle_wqm) {
+      if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
+         if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 ||
+             (ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) {
+            ctx.info[idx].exec.back().second |= mask_type_global;
+            transition_to_Exact(ctx, bld, idx);
+            ctx.handle_wqm = false;
+         }
+      }
+      if (ctx.info[idx].block_needs == WQM)
+         transition_to_WQM(ctx, bld, idx);
+      else if (ctx.info[idx].block_needs == Exact)
+         transition_to_Exact(ctx, bld, idx);
+   }
+
+   if (block->kind & block_kind_merge) {
+      Temp restore = ctx.info[idx].exec.back().first;
+      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), restore);
+   }
+
+   return i;
+}
+
+void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instruction>& instr, Temp cur_exec)
+{
+   Operand offset = instr->operands[1];
+   if (need_check) {
+      /* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
+      Temp nonempty = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), cur_exec, Operand(0u));
+
+      if (offset.isLiteral())
+         offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset);
+
+      offset = bld.sop2(aco_opcode::s_cselect_b32, bld.hint_m0(bld.def(s1)),
+                        offset, Operand(UINT32_MAX), bld.scc(nonempty));
+   } else if (offset.isConstant() && offset.constantValue() > 0xFFFFF) {
+      offset = bld.sop1(aco_opcode::s_mov_b32, bld.hint_m0(bld.def(s1)), offset);
+   }
+   if (!offset.isConstant())
+      offset.setFixed(m0);
+
+   switch (instr->operands[2].size()) {
+   case 1:
+      instr->opcode = aco_opcode::s_buffer_store_dword;
+      break;
+   case 2:
+      instr->opcode = aco_opcode::s_buffer_store_dwordx2;
+      break;
+   case 4:
+      instr->opcode = aco_opcode::s_buffer_store_dwordx4;
+      break;
+   default:
+      unreachable("Invalid SMEM buffer store size");
+   }
+   instr->operands[1] = offset;
+   /* as_uniform() needs to be done here so it's done in exact mode and helper
+    * lanes don't contribute. */
+   instr->operands[2] = Operand(bld.as_uniform(instr->operands[2]));
+}
+
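+/* copies the remaining instructions of the block, inserting WQM/Exact
+ * transitions where needed and lowering p_discard_if, p_is_helper,
+ * p_load_helper, p_demote_to_helper and p_fs_buffer_store_smem */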
+void process_instructions(exec_ctx& ctx, Block* block,
+                          std::vector<aco_ptr<Instruction>>& instructions,
+                          unsigned idx)
+{
+   WQMState state;
+   if (ctx.info[block->index].exec.back().second & mask_type_wqm)
+      state = WQM;
+   else {
+      assert(!ctx.handle_wqm || ctx.info[block->index].exec.back().second & mask_type_exact);
+      state = Exact;
+   }
+
+   /* if the block doesn't need both WQM and Exact, we can skip processing the instructions */
+   bool process = (ctx.handle_wqm &&
+                   (ctx.info[block->index].block_needs & state) !=
+                   (ctx.info[block->index].block_needs & (WQM | Exact))) ||
+                  block->kind & block_kind_uses_discard_if ||
+                  block->kind & block_kind_needs_lowering;
+   if (!process) {
+      std::vector<aco_ptr<Instruction>>::iterator it = std::next(block->instructions.begin(), idx);
+      instructions.insert(instructions.end(),
+                          std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(it),
+                          std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));
+      return;
+   }
+
+   Builder bld(ctx.program, &instructions);
+
+   for (; idx < block->instructions.size(); idx++) {
+      aco_ptr<Instruction> instr = std::move(block->instructions[idx]);
+
+      WQMState needs = ctx.handle_wqm ? ctx.info[block->index].instr_needs[idx] : Unspecified;
+
+      if (instr->opcode == aco_opcode::p_discard_if) {
+         if (ctx.info[block->index].block_needs & Preserve_WQM) {
+            assert(block->kind & block_kind_top_level);
+            transition_to_WQM(ctx, bld, block->index);
+            ctx.info[block->index].exec.back().second &= ~mask_type_global;
+         }
+         unsigned num = ctx.info[block->index].exec.size();
+         assert(num);
+         Operand cond = instr->operands[0];
+         instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1));
+         for (unsigned i = 0; i < num; i++) {
+            instr->operands[i] = Operand(ctx.info[block->index].exec[i].first);
+            if (i == num - 1)
+               instr->operands[i].setFixed(exec);
+            Temp new_mask = bld.tmp(s2);
+            instr->definitions[i] = Definition(new_mask);
+            ctx.info[block->index].exec[i].first = new_mask;
+         }
+         assert((ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
+         instr->definitions[num - 1].setFixed(exec);
+         instr->operands[num] = cond;
+         instr->definitions[num] = bld.def(s1, scc);
+
+      } else if (needs == WQM && state != WQM) {
+         transition_to_WQM(ctx, bld, block->index);
+         state = WQM;
+      } else if (needs == Exact && state != Exact) {
+         transition_to_Exact(ctx, bld, block->index);
+         state = Exact;
+      }
+
+      if (instr->opcode == aco_opcode::p_is_helper || instr->opcode == aco_opcode::p_load_helper) {
+         Definition dst = instr->definitions[0];
+         if (state == Exact) {
+            instr.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b64, Format::SOP1, 1, 1));
+            instr->operands[0] = Operand(0u);
+            instr->definitions[0] = dst;
+         } else {
+            std::pair<Temp, uint8_t>& exact_mask = ctx.info[block->index].exec[0];
+            if (instr->opcode == aco_opcode::p_load_helper &&
+                !(ctx.info[block->index].exec[0].second & mask_type_initial)) {
+               /* find last initial exact mask */
+               for (int i = block->index; i >= 0; i--) {
+                  if (ctx.program->blocks[i].kind & block_kind_top_level &&
+                      ctx.info[i].exec[0].second & mask_type_initial) {
+                     exact_mask = ctx.info[i].exec[0];
+                     break;
+                  }
+               }
+            }
+
+            assert(instr->opcode == aco_opcode::p_is_helper || exact_mask.second & mask_type_initial);
+            assert(exact_mask.second & mask_type_exact);
+
+            instr.reset(create_instruction<SOP2_instruction>(aco_opcode::s_andn2_b64, Format::SOP2, 2, 2));
+            instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */
+            instr->operands[1] = Operand(exact_mask.first);
+            instr->definitions[0] = dst;
+            instr->definitions[1] = bld.def(s1, scc);
+         }
+      } else if (instr->opcode == aco_opcode::p_demote_to_helper) {
+         /* turn demote into discard_if with only exact masks */
+         assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) == (mask_type_exact | mask_type_global));
+         ctx.info[block->index].exec[0].second &= ~mask_type_initial;
+
+         int num = 0;
+         Temp cond;
+         if (instr->operands.empty()) {
+            /* transition to exact and set exec to zero */
+            Temp old_exec = ctx.info[block->index].exec.back().first;
+            Temp new_exec = bld.tmp(s2);
+            cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+                            bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
+            if (ctx.info[block->index].exec.back().second & mask_type_exact) {
+               ctx.info[block->index].exec.back().first = new_exec;
+            } else {
+               ctx.info[block->index].exec.back().first = cond;
+               ctx.info[block->index].exec.emplace_back(new_exec, mask_type_exact);
+            }
+         } else {
+            /* demote_if: transition to exact */
+            transition_to_Exact(ctx, bld, block->index);
+            assert(instr->operands[0].isTemp());
+            cond = instr->operands[0].getTemp();
+            num = 1;
+         }
+
+         for (unsigned i = 0; i < ctx.info[block->index].exec.size() - 1; i++)
+            num += ctx.info[block->index].exec[i].second & mask_type_exact ? 1 : 0;
+         instr.reset(create_instruction<Instruction>(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1));
+         int k = 0;
+         for (unsigned i = 0; k < num; i++) {
+            if (ctx.info[block->index].exec[i].second & mask_type_exact) {
+               instr->operands[k] = Operand(ctx.info[block->index].exec[i].first);
+               Temp new_mask = bld.tmp(s2);
+               instr->definitions[k] = Definition(new_mask);
+               if (i == ctx.info[block->index].exec.size() - 1)
+                  instr->definitions[k].setFixed(exec);
+               k++;
+               ctx.info[block->index].exec[i].first = new_mask;
+            }
+         }
+         assert(k == num);
+         instr->definitions[num] = bld.def(s1, scc);
+         instr->operands[num] = Operand(cond);
+         state = Exact;
+
+      } else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) {
+         bool need_check = ctx.info[block->index].exec.size() != 1 &&
+                           !(ctx.info[block->index].exec[ctx.info[block->index].exec.size() - 2].second & Exact);
+         lower_fs_buffer_store_smem(bld, need_check, instr, ctx.info[block->index].exec.back().first);
+      }
+
+      bld.insert(std::move(instr));
+   }
+}
+
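+/* rewrites the branch at the end of the block according to the block kind
+ * (loop preheader, discard, continue_or_break, uniform, branch, invert,
+ * break, continue) and updates the exec mask stack accordingly */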
+void add_branch_code(exec_ctx& ctx, Block* block)
+{
+   unsigned idx = block->index;
+   Builder bld(ctx.program, block);
+
+   if (idx == ctx.program->blocks.size() - 1)
+      return;
+
+   /* try to disable wqm handling */
+   if (ctx.handle_wqm && block->kind & block_kind_top_level) {
+      if (ctx.info[idx].exec.size() == 3) {
+         assert(ctx.info[idx].exec[1].second == mask_type_wqm);
+         ctx.info[idx].exec.pop_back();
+      }
+      assert(ctx.info[idx].exec.size() <= 2);
+
+      if (ctx.info[idx].ever_again_needs == 0 ||
+          ctx.info[idx].ever_again_needs == Exact) {
+         /* transition to Exact */
+         aco_ptr<Instruction> branch = std::move(block->instructions.back());
+         block->instructions.pop_back();
+         ctx.info[idx].exec.back().second |= mask_type_global;
+         transition_to_Exact(ctx, bld, idx);
+         bld.insert(std::move(branch));
+         ctx.handle_wqm = false;
+
+      } else if (ctx.info[idx].block_needs & Preserve_WQM) {
+         /* transition to WQM and remove global flag */
+         aco_ptr<Instruction> branch = std::move(block->instructions.back());
+         block->instructions.pop_back();
+         transition_to_WQM(ctx, bld, idx);
+         ctx.info[idx].exec.back().second &= ~mask_type_global;
+         bld.insert(std::move(branch));
+      }
+   }
+
+   if (block->kind & block_kind_loop_preheader) {
+      /* collect information about the succeeding loop */
+      bool has_divergent_break = false;
+      bool has_divergent_continue = false;
+      bool has_discard = false;
+      uint8_t needs = 0;
+      unsigned loop_nest_depth = ctx.program->blocks[idx + 1].loop_nest_depth;
+
+      for (unsigned i = idx + 1; ctx.program->blocks[i].loop_nest_depth >= loop_nest_depth; i++) {
+         Block& loop_block = ctx.program->blocks[i];
+         needs |= ctx.info[i].block_needs;
+
+         if (loop_block.kind & block_kind_uses_discard_if ||
+             loop_block.kind & block_kind_discard)
+            has_discard = true;
+         if (loop_block.loop_nest_depth != loop_nest_depth)
+            continue;
+
+         if (loop_block.kind & block_kind_uniform)
+            continue;
+         else if (loop_block.kind & block_kind_break)
+            has_divergent_break = true;
+         else if (loop_block.kind & block_kind_continue)
+            has_divergent_continue = true;
+      }
+
+      if (ctx.handle_wqm) {
+         if (needs & WQM) {
+            aco_ptr<Instruction> branch = std::move(block->instructions.back());
+            block->instructions.pop_back();
+            transition_to_WQM(ctx, bld, idx);
+            bld.insert(std::move(branch));
+         } else {
+            aco_ptr<Instruction> branch = std::move(block->instructions.back());
+            block->instructions.pop_back();
+            transition_to_Exact(ctx, bld, idx);
+            bld.insert(std::move(branch));
+         }
+      }
+
+      unsigned num_exec_masks = ctx.info[idx].exec.size();
+      if (block->kind & block_kind_top_level)
+         num_exec_masks = std::min(num_exec_masks, 2u);
+
+      ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]],
+                            num_exec_masks,
+                            needs,
+                            has_divergent_break,
+                            has_divergent_continue,
+                            has_discard);
+   }
+
+   if (block->kind & block_kind_discard) {
+
+      assert(block->instructions.back()->format == Format::PSEUDO_BRANCH);
+      aco_ptr<Instruction> branch = std::move(block->instructions.back());
+      block->instructions.pop_back();
+
+      /* create a discard_if() instruction with the exec mask as condition */
+      unsigned num = 0;
+      if (ctx.loop.size()) {
+         /* if we're in a loop, only discard from the outer exec masks */
+         num = ctx.loop.back().num_exec_masks;
+      } else {
+         num = ctx.info[idx].exec.size() - 1;
+      }
+
+      Temp old_exec = ctx.info[idx].exec.back().first;
+      Temp new_exec = bld.tmp(s2);
+      Temp cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+                           bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
+      ctx.info[idx].exec.back().first = new_exec;
+
+      aco_ptr<Pseudo_instruction> discard{create_instruction<Pseudo_instruction>(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1)};
+      for (unsigned i = 0; i < num; i++) {
+         discard->operands[i] = Operand(ctx.info[block->index].exec[i].first);
+         Temp new_mask = bld.tmp(s2);
+         discard->definitions[i] = Definition(new_mask);
+         ctx.info[block->index].exec[i].first = new_mask;
+      }
+      assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
+      discard->operands[num] = Operand(cond);
+      discard->definitions[num] = bld.def(s1, scc);
+
+      bld.insert(std::move(discard));
+      if ((block->kind & (block_kind_break | block_kind_uniform)) == block_kind_break)
+         ctx.info[idx].exec.back().first = cond;
+      bld.insert(std::move(branch));
+      /* no return here as it can be followed by a divergent break */
+   }
+
+   if (block->kind & block_kind_continue_or_break) {
+      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
+      block->instructions.pop_back();
+
+      /* because of how linear_succs is created, this needs to be swapped */
+      std::swap(block->linear_succs[0], block->linear_succs[1]);
+
+      assert(ctx.program->blocks[block->linear_succs[1]].kind & block_kind_loop_header);
+      assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind & block_kind_loop_exit);
+
+      if (ctx.info[idx].exec.back().second & mask_type_loop) {
+         bld.branch(aco_opcode::p_cbranch_nz, bld.exec(ctx.info[idx].exec.back().first), block->linear_succs[1], block->linear_succs[0]);
+      } else {
+         Temp cond = Temp();
+         for (int exec_idx = ctx.info[idx].exec.size() - 1; exec_idx >= 0; exec_idx--) {
+            if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) {
+               cond = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u));
+               break;
+            }
+         }
+         assert(cond != Temp());
+
+         bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
+      }
+      return;
+   }
+
+   if (block->kind & block_kind_uniform) {
+      Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(block->instructions.back().get());
+      if (branch->opcode == aco_opcode::p_branch) {
+         branch->target[0] = block->linear_succs[0];
+      } else {
+         branch->target[0] = block->linear_succs[1];
+         branch->target[1] = block->linear_succs[0];
+      }
+      return;
+   }
+
+   if (block->kind & block_kind_branch) {
+
+      if (ctx.handle_wqm &&
+          ctx.info[idx].exec.size() >= 2 &&
+          ctx.info[idx].exec.back().second == mask_type_exact &&
+          !(ctx.info[idx].block_needs & Exact_Branch) &&
+          ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) {
+         /* return to wqm before branching */
+         ctx.info[idx].exec.pop_back();
+      }
+
+      // orig = s_and_saveexec_b64
+      assert(block->linear_succs.size() == 2);
+      assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z);
+      Temp cond = block->instructions.back()->operands[0].getTemp();
+      block->instructions.pop_back();
+
+      if (ctx.info[idx].block_needs & Exact_Branch)
+         transition_to_Exact(ctx, bld, idx);
+
+      Temp current_exec = ctx.info[idx].exec.back().first;
+      uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
+
+      Temp then_mask = bld.tmp(s2);
+      Temp old_exec = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+                               bld.exec(Definition(then_mask)), cond, bld.exec(current_exec));
+
+      ctx.info[idx].exec.back().first = old_exec;
+
+      /* add next current exec to the stack */
+      ctx.info[idx].exec.emplace_back(then_mask, mask_type);
+
+      bld.branch(aco_opcode::p_cbranch_z, bld.exec(then_mask), block->linear_succs[1], block->linear_succs[0]);
+      return;
+   }
+
+   if (block->kind & block_kind_invert) {
+      // exec = s_andn2_b64 (original_exec, exec)
+      assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_nz);
+      block->instructions.pop_back();
+      Temp then_mask = ctx.info[idx].exec.back().first;
+      uint8_t mask_type = ctx.info[idx].exec.back().second;
+      ctx.info[idx].exec.pop_back();
+      Temp orig_exec = ctx.info[idx].exec.back().first;
+      Temp else_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2, exec),
+                                bld.def(s1, scc), orig_exec, bld.exec(then_mask));
+
+      /* add next current exec to the stack */
+      ctx.info[idx].exec.emplace_back(else_mask, mask_type);
+
+      bld.branch(aco_opcode::p_cbranch_z, bld.exec(else_mask), block->linear_succs[1], block->linear_succs[0]);
+      return;
+   }
+
+   if (block->kind & block_kind_break) {
+      // loop_mask = s_andn2_b64 (loop_mask, exec)
+      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
+      block->instructions.pop_back();
+
+      Temp current_exec = ctx.info[idx].exec.back().first;
+      Temp cond = Temp();
+      for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
+         cond = bld.tmp(s1);
+         Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
+         exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)),
+                              exec_mask, current_exec);
+         ctx.info[idx].exec[exec_idx].first = exec_mask;
+         if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
+            break;
+      }
+
+      /* check if the successor is the merge block, otherwise set exec to 0 */
+      // TODO: this could be done better by directly branching to the merge block
+      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
+      Block& succ = ctx.program->blocks[succ_idx];
+      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
+         ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u));
+      }
+
+      bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
+      return;
+   }
+
+   if (block->kind & block_kind_continue) {
+      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
+      block->instructions.pop_back();
+
+      Temp current_exec = ctx.info[idx].exec.back().first;
+      Temp cond = Temp();
+      for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
+         if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
+            break;
+         cond = bld.tmp(s1);
+         Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
+         exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)),
+                              exec_mask, bld.exec(current_exec));
+         ctx.info[idx].exec[exec_idx].first = exec_mask;
+      }
+      assert(cond != Temp());
+
+      /* check if the successor is the merge block, otherwise set exec to 0 */
+      // TODO: this could be done better by directly branching to the merge block
+      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
+      Block& succ = ctx.program->blocks[succ_idx];
+      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
+         ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u));
+      }
+
+      bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
+      return;
+   }
+}
+
+void process_block(exec_ctx& ctx, Block* block)
+{
+   std::vector<aco_ptr<Instruction>> instructions;
+   instructions.reserve(block->instructions.size());
+
+   unsigned idx = add_coupling_code(ctx, block, instructions);
+
+   assert(block->index != ctx.program->blocks.size() - 1 ||
+          ctx.info[block->index].exec.size() <= 2);
+
+   process_instructions(ctx, block, instructions, idx);
+
+   block->instructions = std::move(instructions);
+
+   add_branch_code(ctx, block);
+
+   block->live_out_exec = ctx.info[block->index].exec.back().first;
+}
+
+} /* end namespace */
+
+
+void insert_exec_mask(Program *program)
+{
+   exec_ctx ctx(program);
+
+   if (program->needs_wqm && program->needs_exact)
+      calculate_wqm_needs(ctx);
+
+   for (Block& block : program->blocks)
+      process_block(ctx, &block);
+
+}
+
+}
+
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
new file mode 100644 (file)
index 0000000..d19fdad
--- /dev/null
@@ -0,0 +1,697 @@
+/*
+ * Copyright © 2018 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <algorithm>
+#include <map>
+
+#include "aco_ir.h"
+#include "vulkan/radv_shader.h"
+
+namespace aco {
+
+namespace {
+
+/**
+ * The general idea of this pass is:
+ * The CFG is traversed in reverse postorder (forward).
+ * Per BB one wait_ctx is maintained.
+ * The in-context is the joined out-contexts of the predecessors.
+ * The context contains a map: gpr -> wait_entry
+ * consisting of the information about the cnt values to be waited for.
+ * Note: After merge-nodes, it might occur that for the same register
+ *       multiple cnt values are to be waited for.
+ *
+ * The values are updated according to the encountered instructions:
+ * - additional events increment the counters of waits of the same type
+ * - or erase gprs whose counters are higher than the value to be waited for.
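+ *
+ * For example: after two consecutive VMEM loads into v0 and then v1, the map
+ * contains v0 -> vm_cnt 1 and v1 -> vm_cnt 0, so a later use of v0 only
+ * requires a wait for vm_cnt <= 1.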
+ */
+
+// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) when there is a load followed by a use of a previous load
+
+/* Instructions of the same event will finish in-order except for smem
+ * and maybe flat. Instructions of different events may not finish in-order. */
+enum wait_event : uint16_t {
+   event_smem = 1 << 0,
+   event_lds = 1 << 1,
+   event_gds = 1 << 2,
+   event_vmem = 1 << 3,
+   event_vmem_store = 1 << 4, /* GFX10+ */
+   event_flat = 1 << 5,
+   event_exp_pos = 1 << 6,
+   event_exp_param = 1 << 7,
+   event_exp_mrt_null = 1 << 8,
+   event_gds_gpr_lock = 1 << 9,
+   event_vmem_gpr_lock = 1 << 10,
+};
+
+enum counter_type : uint8_t {
+   counter_exp = 1 << 0,
+   counter_lgkm = 1 << 1,
+   counter_vm = 1 << 2,
+   counter_vs = 1 << 3,
+};
+
+static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
+static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat;
+static const uint16_t vm_events = event_vmem | event_flat;
+static const uint16_t vs_events = event_vmem_store;
+
+uint8_t get_counters_for_event(wait_event ev)
+{
+   switch (ev) {
+   case event_smem:
+   case event_lds:
+   case event_gds:
+      return counter_lgkm;
+   case event_vmem:
+      return counter_vm;
+   case event_vmem_store:
+      return counter_vs;
+   case event_flat:
+      return counter_vm | counter_lgkm;
+   case event_exp_pos:
+   case event_exp_param:
+   case event_exp_mrt_null:
+   case event_gds_gpr_lock:
+   case event_vmem_gpr_lock:
+      return counter_exp;
+   default:
+      return 0;
+   }
+}
+
+struct wait_imm {
+   static const uint8_t unset_counter = 0xff;
+
+   uint8_t vm;
+   uint8_t exp;
+   uint8_t lgkm;
+   uint8_t vs;
+
+   wait_imm() :
+      vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) {}
+   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) :
+      vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}
+
+   uint16_t pack(enum chip_class chip) const
+   {
+      uint16_t imm = 0;
+      assert(exp == unset_counter || exp <= 0x7);
+      switch (chip) {
+      case GFX10:
+         assert(lgkm == unset_counter || lgkm <= 0x3f);
+         assert(vm == unset_counter || vm <= 0x3f);
+         imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
+         break;
+      case GFX9:
+         assert(lgkm == unset_counter || lgkm <= 0xf);
+         assert(vm == unset_counter || vm <= 0x3f);
+         imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
+         break;
+      default:
+         assert(lgkm == unset_counter || lgkm <= 0xf);
+         assert(vm == unset_counter || vm <= 0xf);
+         imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
+         break;
+      }
+      if (chip < GFX9 && vm == wait_imm::unset_counter)
+         imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the architecture when interpreting the immediate */
+      if (chip < GFX10 && lgkm == wait_imm::unset_counter)
+         imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the architecture when interpreting the immediate */
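+
+      /* e.g. on GFX9, a wait for vmcnt(0) alone (exp/lgkm/vs unset) packs to
+       * ((0xff & 0xf) << 8) | ((0xff & 0x7) << 4) | 0x0 = 0xf70, plus the
+       * 0x3000 from above because lgkm is unset, giving 0x3f70 */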
+      return imm;
+   }
+
+   void combine(const wait_imm& other)
+   {
+      vm = std::min(vm, other.vm);
+      exp = std::min(exp, other.exp);
+      lgkm = std::min(lgkm, other.lgkm);
+      vs = std::min(vs, other.vs);
+   }
+
+   bool empty() const
+   {
+      return vm == unset_counter && exp == unset_counter &&
+             lgkm == unset_counter && vs == unset_counter;
+   }
+};
+
+struct wait_entry {
+   wait_imm imm;
+   uint16_t events; /* use wait_event notion */
+   uint8_t counters; /* use counter_type notion */
+   bool wait_on_read:1;
+   bool logical:1;
+
+   wait_entry(wait_event event, wait_imm imm, bool logical, bool wait_on_read)
+           : imm(imm), events(event), counters(get_counters_for_event(event)),
+             wait_on_read(wait_on_read), logical(logical) {}
+
+   void join(const wait_entry& other)
+   {
+      events |= other.events;
+      counters |= other.counters;
+      imm.combine(other.imm);
+      wait_on_read = wait_on_read || other.wait_on_read;
+      assert(logical == other.logical);
+   }
+
+   void remove_counter(counter_type counter)
+   {
+      counters &= ~counter;
+
+      if (counter == counter_lgkm) {
+         imm.lgkm = wait_imm::unset_counter;
+         events &= ~(event_smem | event_lds | event_gds);
+      }
+
+      if (counter == counter_vm) {
+         imm.vm = wait_imm::unset_counter;
+         events &= ~event_vmem;
+      }
+
+      if (counter == counter_exp) {
+         imm.exp = wait_imm::unset_counter;
+         events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock);
+      }
+
+      if (counter == counter_vs) {
+         imm.vs = wait_imm::unset_counter;
+         events &= ~event_vmem_store;
+      }
+
+      if (!(counters & counter_lgkm) && !(counters & counter_vm))
+         events &= ~event_flat;
+   }
+};
+
+struct wait_ctx {
+   Program *program;
+   enum chip_class chip_class;
+   uint16_t max_vm_cnt;
+   uint16_t max_exp_cnt;
+   uint16_t max_lgkm_cnt;
+   uint16_t max_vs_cnt;
+   uint16_t unordered_events = event_smem | event_flat;
+
+   uint8_t vm_cnt = 0;
+   uint8_t exp_cnt = 0;
+   uint8_t lgkm_cnt = 0;
+   uint8_t vs_cnt = 0;
+   bool pending_flat_lgkm = false;
+   bool pending_flat_vm = false;
+
+   wait_imm barrier_imm[barrier_count];
+
+   std::map<PhysReg,wait_entry> gpr_map;
+
+   wait_ctx() {}
+   wait_ctx(Program *program_)
+           : program(program_),
+             chip_class(program_->chip_class),
+             max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14),
+             max_exp_cnt(6),
+             max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
+             max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
+             unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {}
+
+   void join(const wait_ctx* other, bool logical)
+   {
+      exp_cnt = std::max(exp_cnt, other->exp_cnt);
+      vm_cnt = std::max(vm_cnt, other->vm_cnt);
+      lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
+      vs_cnt = std::max(vs_cnt, other->vs_cnt);
+      pending_flat_lgkm |= other->pending_flat_lgkm;
+      pending_flat_vm |= other->pending_flat_vm;
+
+      for (std::pair<PhysReg,wait_entry> entry : other->gpr_map)
+      {
+         std::map<PhysReg,wait_entry>::iterator it = gpr_map.find(entry.first);
+         if (entry.second.logical != logical)
+            continue;
+
+         if (it != gpr_map.end())
+            it->second.join(entry.second);
+         else
+            gpr_map.insert(entry);
+      }
+
+      for (unsigned i = 0; i < barrier_count; i++)
+         barrier_imm[i].combine(other->barrier_imm[i]);
+   }
+};
+
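+/* returns the waits required before instr: the combined imms of all map
+ * entries for registers it reads (if wait_on_read) or writes; writes by
+ * in-order VMEM or LDS/GDS instructions drop the corresponding counter
+ * instead of requiring a wait */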
+wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
+{
+   wait_imm wait;
+
+   for (const Operand op : instr->operands) {
+      if (op.isConstant() || op.isUndefined())
+         continue;
+
+      /* check consecutively read gprs */
+      for (unsigned j = 0; j < op.size(); j++) {
+         PhysReg reg{op.physReg() + j};
+         std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
+         if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
+            continue;
+
+         wait.combine(it->second.imm);
+      }
+   }
+
+   for (const Definition& def : instr->definitions) {
+      /* check consecutively written gprs */
+      for (unsigned j = 0; j < def.getTemp().size(); j++)
+      {
+         PhysReg reg{def.physReg() + j};
+
+         std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
+         if (it == ctx.gpr_map.end())
+            continue;
+
+         /* Vector Memory reads and writes return in the order they were issued */
+         if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem)) {
+            it->second.remove_counter(counter_vm);
+            if (!it->second.counters)
+               it = ctx.gpr_map.erase(it);
+            continue;
+         }
+
+         /* LDS reads and writes return in the order they were issued; the same applies to GDS. */
+         if (instr->format == Format::DS) {
+            bool gds = static_cast<DS_instruction*>(instr)->gds;
+            if ((it->second.events & lgkm_events) == (gds ? event_gds : event_lds)) {
+               it->second.remove_counter(counter_lgkm);
+               if (!it->second.counters)
+                  it = ctx.gpr_map.erase(it);
+               continue;
+            }
+         }
+
+         wait.combine(it->second.imm);
+      }
+   }
+
+   return wait;
+}
+
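+/* Compute the wait immediate that must be emitted before instr and update the
+ * context accordingly: reset the satisfied counters, barrier requirements and
+ * per-register entries. */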
+wait_imm kill(Instruction* instr, wait_ctx& ctx)
+{
+   wait_imm imm;
+   if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
+      imm.combine(check_instr(instr, ctx));
+
+   if (instr->format == Format::PSEUDO_BARRIER) {
+      unsigned* bsize = ctx.program->info->cs.block_size;
+      unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
+      switch (instr->opcode) {
+      case aco_opcode::p_memory_barrier_all:
+         for (unsigned i = 0; i < barrier_count; i++) {
+            if ((1 << i) == barrier_shared && workgroup_size <= 64)
+               continue;
+            imm.combine(ctx.barrier_imm[i]);
+         }
+         break;
+      case aco_opcode::p_memory_barrier_atomic:
+         imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
+         break;
+      /* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */
+      case aco_opcode::p_memory_barrier_buffer:
+      case aco_opcode::p_memory_barrier_image:
+         imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
+         imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
+         break;
+      case aco_opcode::p_memory_barrier_shared:
+         if (workgroup_size > 64)
+            imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
+         break;
+      default:
+         assert(false);
+         break;
+      }
+   }
+
+   if (!imm.empty()) {
+      if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
+         imm.vm = 0;
+      if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
+         imm.lgkm = 0;
+
+      /* reset counters */
+      ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp);
+      ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm);
+      ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm);
+      ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);
+
+      /* update barrier wait imms */
+      for (unsigned i = 0; i < barrier_count; i++) {
+         wait_imm& bar = ctx.barrier_imm[i];
+         if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp)
+            bar.exp = wait_imm::unset_counter;
+         if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm)
+            bar.vm = wait_imm::unset_counter;
+         if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm)
+            bar.lgkm = wait_imm::unset_counter;
+         if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs)
+            bar.vs = wait_imm::unset_counter;
+      }
+
+      /* remove all vgprs with higher counter from map */
+      std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.begin();
+      while (it != ctx.gpr_map.end())
+      {
+         if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
+            it->second.remove_counter(counter_exp);
+         if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
+            it->second.remove_counter(counter_vm);
+         if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
+            it->second.remove_counter(counter_lgkm);
+         if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
+            it->second.remove_counter(counter_vs);
+         if (!it->second.counters)
+            it = ctx.gpr_map.erase(it);
+         else
+            it++;
+      }
+   }
+
+   if (imm.vm == 0)
+      ctx.pending_flat_vm = false;
+   if (imm.lgkm == 0)
+      ctx.pending_flat_lgkm = false;
+
+   return imm;
+}
+
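+/* A new memory operation was issued: the barrier it interacts with now needs a
+ * wait until its counters reach zero, while the outstanding operations of all
+ * other barriers move one slot further away (their required counter values are
+ * incremented). */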
+void update_barrier_imm(wait_ctx& ctx, uint8_t counters, barrier_interaction barrier)
+{
+   unsigned barrier_index = ffs(barrier) - 1;
+   for (unsigned i = 0; i < barrier_count; i++) {
+      wait_imm& bar = ctx.barrier_imm[i];
+      if (i == barrier_index) {
+         if (counters & counter_lgkm)
+            bar.lgkm = 0;
+         if (counters & counter_vm)
+            bar.vm = 0;
+         if (counters & counter_exp)
+            bar.exp = 0;
+         if (counters & counter_vs)
+            bar.vs = 0;
+      } else {
+         if (counters & counter_lgkm && bar.lgkm != wait_imm::unset_counter && bar.lgkm < ctx.max_lgkm_cnt)
+            bar.lgkm++;
+         if (counters & counter_vm && bar.vm != wait_imm::unset_counter && bar.vm < ctx.max_vm_cnt)
+            bar.vm++;
+         if (counters & counter_exp && bar.exp != wait_imm::unset_counter && bar.exp < ctx.max_exp_cnt)
+            bar.exp++;
+         if (counters & counter_vs && bar.vs != wait_imm::unset_counter && bar.vs < ctx.max_vs_cnt)
+            bar.vs++;
+      }
+   }
+}
+
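+/* Account for a newly issued instruction: bump the hardware counters for its
+ * event, update the barrier requirements, and increment the required wait
+ * value of every register entry whose outstanding events on the same counter
+ * match this (ordered) event. */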
+void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none)
+{
+   uint8_t counters = get_counters_for_event(event);
+
+   if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
+      ctx.lgkm_cnt++;
+   if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt)
+      ctx.vm_cnt++;
+   if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt)
+      ctx.exp_cnt++;
+   if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
+      ctx.vs_cnt++;
+
+   update_barrier_imm(ctx, counters, barrier);
+
+   if (ctx.unordered_events & event)
+      return;
+
+   if (ctx.pending_flat_lgkm)
+      counters &= ~counter_lgkm;
+   if (ctx.pending_flat_vm)
+      counters &= ~counter_vm;
+
+   for (std::pair<const PhysReg,wait_entry>& e : ctx.gpr_map) {
+      wait_entry& entry = e.second;
+
+      if (entry.events & ctx.unordered_events)
+         continue;
+
+      assert(entry.events);
+
+      if ((counters & counter_exp) && (entry.events & exp_events) == event && entry.imm.exp < ctx.max_exp_cnt)
+         entry.imm.exp++;
+      if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && entry.imm.lgkm < ctx.max_lgkm_cnt)
+         entry.imm.lgkm++;
+      if ((counters & counter_vm) && (entry.events & vm_events) == event && entry.imm.vm < ctx.max_vm_cnt)
+         entry.imm.vm++;
+      if ((counters & counter_vs) && (entry.events & vs_events) == event && entry.imm.vs < ctx.max_vs_cnt)
+         entry.imm.vs++;
+   }
+}
+
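+/* FLAT loads increment both lgkm_cnt and vm_cnt and can complete out of order
+ * with respect to other VMEM/LGKM operations, so waits on these counters have
+ * to go all the way down to zero while a flat load is outstanding. */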
+void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none)
+{
+   assert(ctx.chip_class < GFX10);
+
+   if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
+      ctx.lgkm_cnt++;
+   if (ctx.vm_cnt <= ctx.max_vm_cnt)
+      ctx.vm_cnt++;
+
+   update_barrier_imm(ctx, counter_vm | counter_lgkm, barrier);
+
+   for (std::pair<const PhysReg,wait_entry>& e : ctx.gpr_map)
+   {
+      if (e.second.counters & counter_vm)
+         e.second.imm.vm = 0;
+      if (e.second.counters & counter_lgkm)
+         e.second.imm.lgkm = 0;
+   }
+   ctx.pending_flat_lgkm = true;
+   ctx.pending_flat_vm = true;
+}
+
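+/* Create (or merge) a wait_entry for every physical register covered by reg,
+ * recording that waiting for it currently requires a counter value of zero
+ * (this value is aged as later operations are issued). */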
+void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read)
+{
+   uint16_t counters = get_counters_for_event(event);
+   wait_imm imm;
+   if (counters & counter_lgkm)
+      imm.lgkm = 0;
+   if (counters & counter_vm)
+      imm.vm = 0;
+   if (counters & counter_exp)
+      imm.exp = 0;
+   if (counters & counter_vs)
+      imm.vs = 0;
+
+   wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read);
+
+   for (unsigned i = 0; i < rc.size(); i++) {
+      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg+i}, new_entry);
+      if (!it.second)
+         it.first->second.join(new_entry);
+   }
+}
+
+void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event)
+{
+   if (!op.isConstant() && !op.isUndefined())
+      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false);
+}
+
+void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event)
+{
+   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true);
+}
+
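+/* Record the events generated by instr: update the counters and insert wait
+ * entries for the registers it defines and, for exports and GDS, for the
+ * operands and exec mask it reads. */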
+void gen(Instruction* instr, wait_ctx& ctx)
+{
+   switch (instr->format) {
+   case Format::EXP: {
+      Export_instruction* exp_instr = static_cast<Export_instruction*>(instr);
+
+      wait_event ev;
+      if (exp_instr->dest <= 9)
+         ev = event_exp_mrt_null;
+      else if (exp_instr->dest <= 15)
+         ev = event_exp_pos;
+      else
+         ev = event_exp_param;
+      update_counters(ctx, ev);
+
+      /* insert new entries for exported vgprs */
+      for (unsigned i = 0; i < 4; i++)
+      {
+         if (exp_instr->enabled_mask & (1 << i)) {
+            unsigned idx = exp_instr->compressed ? i >> 1 : i;
+            assert(idx < exp_instr->operands.size());
+            insert_wait_entry(ctx, exp_instr->operands[idx], ev);
+         }
+      }
+      insert_wait_entry(ctx, exec, s2, ev, false);
+      break;
+   }
+   case Format::FLAT: {
+      if (ctx.chip_class < GFX10 && !instr->definitions.empty())
+         update_counters_for_flat_load(ctx, barrier_buffer);
+      else
+         update_counters(ctx, event_flat, barrier_buffer);
+
+      if (!instr->definitions.empty())
+         insert_wait_entry(ctx, instr->definitions[0], event_flat);
+      break;
+   }
+   case Format::SMEM: {
+      update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier);
+
+      if (!instr->definitions.empty())
+         insert_wait_entry(ctx, instr->definitions[0], event_smem);
+      break;
+   }
+   case Format::DS: {
+      bool gds = static_cast<DS_instruction*>(instr)->gds;
+      update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared);
+      if (gds)
+         update_counters(ctx, event_gds_gpr_lock);
+
+      if (!instr->definitions.empty())
+         insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds);
+
+      if (gds) {
+         for (const Operand& op : instr->operands)
+            insert_wait_entry(ctx, op, event_gds_gpr_lock);
+         insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
+      }
+      break;
+   }
+   case Format::MUBUF:
+   case Format::MTBUF:
+   case Format::MIMG:
+   case Format::GLOBAL: {
+      wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
+      update_counters(ctx, ev, get_barrier_interaction(instr));
+
+      if (!instr->definitions.empty())
+         insert_wait_entry(ctx, instr->definitions[0], ev);
+
+      if (instr->operands.size() == 4 && ctx.chip_class == GFX6) {
+         ctx.exp_cnt++;
+         update_counters(ctx, event_vmem_gpr_lock);
+         insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
+      }
+      break;
+   }
+   default:
+      break;
+   }
+}
+
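+/* Emit the actual wait instructions: s_waitcnt_vscnt for the separate GFX10
+ * store counter and s_waitcnt with the packed vm/exp/lgkm immediate, each only
+ * when needed. */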
+void emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm imm)
+{
+   if (imm.vs != wait_imm::unset_counter) {
+      assert(ctx.chip_class >= GFX10);
+      SOPK_instruction* waitcnt_vs = create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 0);
+      waitcnt_vs->imm = imm.vs;
+      instructions.emplace_back(waitcnt_vs);
+      imm.vs = wait_imm::unset_counter;
+   }
+   if (!imm.empty()) {
+      SOPP_instruction* waitcnt = create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
+      waitcnt->imm = imm.pack(ctx.chip_class);
+      waitcnt->block = -1;
+      instructions.emplace_back(waitcnt);
+   }
+}
+
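+/* Insert the required waits in front of each instruction of the block. If the
+ * block branches back to an earlier block (the back-edge of a loop), all
+ * outstanding operations are waited for before the branch. */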
+void handle_block(Program *program, Block& block, wait_ctx& ctx)
+{
+   std::vector<aco_ptr<Instruction>> new_instructions;
+
+   for (aco_ptr<Instruction>& instr : block.instructions) {
+      wait_imm imm = kill(instr.get(), ctx);
+
+      if (!imm.empty())
+         emit_waitcnt(ctx, new_instructions, imm);
+
+      gen(instr.get(), ctx);
+
+      if (instr->format != Format::PSEUDO_BARRIER)
+         new_instructions.emplace_back(std::move(instr));
+   }
+
+   /* check if this block is at the end of a loop */
+   for (unsigned succ_idx : block.linear_succs) {
+      /* eliminate any remaining counters */
+      if (succ_idx <= block.index && (ctx.vm_cnt || ctx.exp_cnt || ctx.lgkm_cnt || ctx.vs_cnt)) {
+         // TODO: we could do better if we only wait if the regs between the block and other predecessors differ
+
+         aco_ptr<Instruction> branch = std::move(new_instructions.back());
+         new_instructions.pop_back();
+
+         wait_imm imm(ctx.vm_cnt ? 0 : wait_imm::unset_counter,
+                      ctx.exp_cnt ? 0 : wait_imm::unset_counter,
+                      ctx.lgkm_cnt ? 0 : wait_imm::unset_counter,
+                      ctx.vs_cnt ? 0 : wait_imm::unset_counter);
+         emit_waitcnt(ctx, new_instructions, imm);
+
+         new_instructions.push_back(std::move(branch));
+
+         ctx = wait_ctx(program);
+         break;
+      }
+   }
+   block.instructions.swap(new_instructions);
+}
+
+} /* end namespace */
+
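+/* Pass entry point: process the blocks in program order, joining the contexts
+ * of all already-visited predecessors before handling each block. */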
+void insert_wait_states(Program* program)
+{
+   wait_ctx out_ctx[program->blocks.size()]; /* per BB ctx */
+   for (unsigned i = 0; i < program->blocks.size(); i++)
+      out_ctx[i] = wait_ctx(program);
+
+   for (unsigned i = 0; i < program->blocks.size(); i++) {
+      Block& current = program->blocks[i];
+      wait_ctx& in = out_ctx[current.index];
+
+      for (unsigned b : current.linear_preds)
+         in.join(&out_ctx[b], false);
+      for (unsigned b : current.logical_preds)
+         in.join(&out_ctx[b], true);
+
+      if (current.instructions.empty())
+         continue;
+
+      handle_block(program, current, in);
+   }
+}
+
+}
+
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
new file mode 100644 (file)
index 0000000..d52043f
--- /dev/null
@@ -0,0 +1,7621 @@
+/*
+ * Copyright © 2018 Valve Corporation
+ * Copyright © 2018 Google
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <algorithm>
+#include <map>
+
+#include "aco_ir.h"
+#include "aco_builder.h"
+#include "aco_interface.h"
+#include "aco_instruction_selection_setup.cpp"
+#include "util/fast_idiv_by_const.h"
+
+namespace aco {
+namespace {
+
+class loop_info_RAII {
+   isel_context* ctx;
+   unsigned header_idx_old;
+   Block* exit_old;
+   bool divergent_cont_old;
+   bool divergent_branch_old;
+   bool divergent_if_old;
+
+public:
+   loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
+      : ctx(ctx),
+        header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
+        divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
+        divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
+        divergent_if_old(ctx->cf_info.parent_if.is_divergent)
+   {
+      ctx->cf_info.parent_loop.header_idx = loop_header_idx;
+      ctx->cf_info.parent_loop.exit = loop_exit;
+      ctx->cf_info.parent_loop.has_divergent_continue = false;
+      ctx->cf_info.parent_loop.has_divergent_branch = false;
+      ctx->cf_info.parent_if.is_divergent = false;
+      ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
+   }
+
+   ~loop_info_RAII()
+   {
+      ctx->cf_info.parent_loop.header_idx = header_idx_old;
+      ctx->cf_info.parent_loop.exit = exit_old;
+      ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
+      ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
+      ctx->cf_info.parent_if.is_divergent = divergent_if_old;
+      ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
+      if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
+         ctx->cf_info.exec_potentially_empty = false;
+   }
+};
+
+struct if_context {
+   Temp cond;
+
+   bool divergent_old;
+   bool exec_potentially_empty_old;
+
+   unsigned BB_if_idx;
+   unsigned invert_idx;
+   bool then_branch_divergent;
+   Block BB_invert;
+   Block BB_endif;
+};
+
+static void visit_cf_list(struct isel_context *ctx,
+                          struct exec_list *list);
+
+static void add_logical_edge(unsigned pred_idx, Block *succ)
+{
+   succ->logical_preds.emplace_back(pred_idx);
+}
+
+
+static void add_linear_edge(unsigned pred_idx, Block *succ)
+{
+   succ->linear_preds.emplace_back(pred_idx);
+}
+
+static void add_edge(unsigned pred_idx, Block *succ)
+{
+   add_logical_edge(pred_idx, succ);
+   add_linear_edge(pred_idx, succ);
+}
+
+static void append_logical_start(Block *b)
+{
+   Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
+}
+
+static void append_logical_end(Block *b)
+{
+   Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
+}
+
+Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
+{
+   assert(ctx->allocated[def->index].id());
+   return ctx->allocated[def->index];
+}
+
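+/* Copy src into dst in whole quad mode. Outside of fragment shaders this is
+ * just a plain copy (or src itself if no explicit dst was requested). */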
+Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   if (ctx->stage != fragment_fs) {
+      if (!dst.id())
+         return src;
+
+      if (src.type() == RegType::vgpr || src.size() > 1)
+         bld.copy(Definition(dst), src);
+      else
+         bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
+      return dst;
+   }
+
+   if (!dst.id())
+      dst = bld.tmp(src.regClass());
+
+   bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
+   ctx->program->needs_wqm |= program_needs_wqm;
+   return dst;
+}
+
+Temp as_vgpr(isel_context *ctx, Temp val)
+{
+   if (val.type() == RegType::sgpr) {
+      Builder bld(ctx->program, ctx->block);
+      return bld.copy(bld.def(RegType::vgpr, val.size()), val);
+   }
+   assert(val.type() == RegType::vgpr);
+   return val;
+}
+
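+/* Unsigned division by a constant: powers of two become a right-shift,
+ * everything else uses the multiply-by-reciprocal sequence computed by
+ * util_compute_fast_udiv_info (optional pre-shift, increment, multiply-high
+ * and post-shift). */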
+//assumes a != 0xffffffff
+void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
+{
+   assert(b != 0);
+   Builder bld(ctx->program, ctx->block);
+
+   if (util_is_power_of_two_or_zero(b)) {
+      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
+      return;
+   }
+
+   util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
+
+   assert(info.multiplier <= 0xffffffff);
+
+   bool pre_shift = info.pre_shift != 0;
+   bool increment = info.increment != 0;
+   bool multiply = true;
+   bool post_shift = info.post_shift != 0;
+
+   if (!pre_shift && !increment && !multiply && !post_shift) {
+      bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
+      return;
+   }
+
+   Temp pre_shift_dst = a;
+   if (pre_shift) {
+      pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
+      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
+   }
+
+   Temp increment_dst = pre_shift_dst;
+   if (increment) {
+      increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
+      bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
+   }
+
+   Temp multiply_dst = increment_dst;
+   if (multiply) {
+      multiply_dst = post_shift ? bld.tmp(v1) : dst;
+      bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
+               bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
+   }
+
+   if (post_shift) {
+      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
+   }
+}
+
+void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
+{
+   Builder bld(ctx->program, ctx->block);
+   bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
+}
+
+
+Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
+{
+   /* no need to extract the whole vector */
+   if (src.regClass() == dst_rc) {
+      assert(idx == 0);
+      return src;
+   }
+   assert(src.size() > idx);
+   Builder bld(ctx->program, ctx->block);
+   auto it = ctx->allocated_vec.find(src.id());
+   /* the size check needs to be early because elements other than 0 may be garbage */
+   if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
+      if (it->second[idx].regClass() == dst_rc) {
+         return it->second[idx];
+      } else {
+         assert(dst_rc.size() == it->second[idx].regClass().size());
+         assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
+         return bld.copy(bld.def(dst_rc), it->second[idx]);
+      }
+   }
+
+   if (src.size() == dst_rc.size()) {
+      assert(idx == 0);
+      return bld.copy(bld.def(dst_rc), src);
+   } else {
+      Temp dst = bld.tmp(dst_rc);
+      emit_extract_vector(ctx, src, idx, dst);
+      return dst;
+   }
+}
+
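+/* Split vec_src into its components with p_split_vector and cache them in
+ * allocated_vec so later extracts can reuse them without emitting new
+ * instructions. */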
+void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
+{
+   if (num_components == 1)
+      return;
+   if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
+      return;
+   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
+   split->operands[0] = Operand(vec_src);
+   std::array<Temp,4> elems;
+   for (unsigned i = 0; i < num_components; i++) {
+      elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
+      split->definitions[i] = Definition(elems[i]);
+   }
+   ctx->block->instructions.emplace_back(std::move(split));
+   ctx->allocated_vec.emplace(vec_src.id(), elems);
+}
+
+/* This vector expansion uses a mask to determine which elements in the new vector
+ * come from the original vector. The other elements are undefined. */
+void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
+{
+   emit_split_vector(ctx, vec_src, util_bitcount(mask));
+
+   if (vec_src == dst)
+      return;
+
+   Builder bld(ctx->program, ctx->block);
+   if (num_components == 1) {
+      if (dst.type() == RegType::sgpr)
+         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
+      else
+         bld.copy(Definition(dst), vec_src);
+      return;
+   }
+
+   unsigned component_size = dst.size() / num_components;
+   std::array<Temp,4> elems;
+
+   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
+   vec->definitions[0] = Definition(dst);
+   unsigned k = 0;
+   for (unsigned i = 0; i < num_components; i++) {
+      if (mask & (1 << i)) {
+         Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
+         if (dst.type() == RegType::sgpr)
+            src = bld.as_uniform(src);
+         vec->operands[i] = Operand(src);
+      } else {
+         vec->operands[i] = Operand(0u);
+      }
+      elems[i] = vec->operands[i].getTemp();
+   }
+   ctx->block->instructions.emplace_back(std::move(vec));
+   ctx->allocated_vec.emplace(dst.id(), elems);
+}
+
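+/* Booleans are either uniform (s1: 0 or 1, used through scc) or divergent
+ * (s2: a per-lane mask). This converts the uniform form into a lane mask by
+ * selecting between all-ones and zero. */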
+Temp as_divergent_bool(isel_context *ctx, Temp val, bool vcc_hint)
+{
+   if (val.regClass() == s2) {
+      return val;
+   } else {
+      assert(val.regClass() == s1);
+      Builder bld(ctx->program, ctx->block);
+      Definition& def = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2),
+                                 Operand((uint32_t) -1), Operand(0u), bld.scc(val)).def(0);
+      if (vcc_hint)
+         def.setHint(vcc);
+      return def.getTemp();
+   }
+}
+
+Temp as_uniform_bool(isel_context *ctx, Temp val)
+{
+   if (val.regClass() == s1) {
+      return val;
+   } else {
+      assert(val.regClass() == s2);
+      Builder bld(ctx->program, ctx->block);
+      return bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(0u), Operand(val));
+   }
+}
+
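+/* Fetch a NIR ALU source as a Temp of the requested size, resolving swizzles:
+ * identity swizzles return the SSA temp directly, single components are
+ * extracted, anything else is reassembled with p_create_vector. */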
+Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
+{
+   if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
+      return get_ssa_temp(ctx, src.src.ssa);
+
+   if (src.src.ssa->num_components == size) {
+      bool identity_swizzle = true;
+      for (unsigned i = 0; identity_swizzle && i < size; i++) {
+         if (src.swizzle[i] != i)
+            identity_swizzle = false;
+      }
+      if (identity_swizzle)
+         return get_ssa_temp(ctx, src.src.ssa);
+   }
+
+   Temp vec = get_ssa_temp(ctx, src.src.ssa);
+   unsigned elem_size = vec.size() / src.src.ssa->num_components;
+   assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
+   assert(vec.size() % elem_size == 0);
+
+   RegClass elem_rc = RegClass(vec.type(), elem_size);
+   if (size == 1) {
+      return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
+   } else {
+      assert(size <= 4);
+      std::array<Temp,4> elems;
+      aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
+      for (unsigned i = 0; i < size; ++i) {
+         elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
+         vec_instr->operands[i] = Operand{elems[i]};
+      }
+      Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
+      vec_instr->definitions[0] = Definition(dst);
+      ctx->block->instructions.emplace_back(std::move(vec_instr));
+      ctx->allocated_vec.emplace(dst.id(), elems);
+      return dst;
+   }
+}
+
+Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
+{
+   if (ptr.size() == 2)
+      return ptr;
+   Builder bld(ctx->program, ctx->block);
+   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
+                     ptr, Operand((unsigned)ctx->options->address32_hi));
+}
+
+void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
+{
+   aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
+   sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
+   sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
+   sop2->definitions[0] = Definition(dst);
+   if (writes_scc)
+      sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
+   ctx->block->instructions.emplace_back(std::move(sop2));
+}
+
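+/* VOP2 can only take an SGPR or constant as its first source operand, so an
+ * SGPR in src1 is handled by swapping the operands (for commutative opcodes),
+ * falling back to the VOP3 encoding, or copying src1 into a VGPR. */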
+void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false)
+{
+   Builder bld(ctx->program, ctx->block);
+   Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
+   Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
+   if (src1.type() == RegType::sgpr) {
+      if (commutative && src0.type() == RegType::vgpr) {
+         Temp t = src0;
+         src0 = src1;
+         src1 = t;
+      } else if (src0.type() == RegType::vgpr &&
+                 op != aco_opcode::v_madmk_f32 &&
+                 op != aco_opcode::v_madak_f32 &&
+                 op != aco_opcode::v_madmk_f16 &&
+                 op != aco_opcode::v_madak_f16) {
+         /* If the instruction is not commutative, we emit a VOP3A instruction */
+         bld.vop2_e64(op, Definition(dst), src0, src1);
+         return;
+      } else {
+         src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
+      }
+   }
+   bld.vop2(op, Definition(dst), src0, src1);
+}
+
+void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
+{
+   Temp src0 = get_alu_src(ctx, instr->src[0]);
+   Temp src1 = get_alu_src(ctx, instr->src[1]);
+   Temp src2 = get_alu_src(ctx, instr->src[2]);
+
+   /* ensure that the instruction has at most 1 sgpr operand
+    * The optimizer will inline constants for us */
+   if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
+      src0 = as_vgpr(ctx, src0);
+   if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
+      src1 = as_vgpr(ctx, src1);
+   if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
+      src2 = as_vgpr(ctx, src2);
+
+   Builder bld(ctx->program, ctx->block);
+   bld.vop3(op, Definition(dst), src0, src1, src2);
+}
+
+void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
+{
+   Builder bld(ctx->program, ctx->block);
+   bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
+}
+
+void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
+{
+   Temp src0 = get_alu_src(ctx, instr->src[0]);
+   Temp src1 = get_alu_src(ctx, instr->src[1]);
+   aco_ptr<Instruction> vopc;
+   if (src1.type() == RegType::sgpr) {
+      if (src0.type() == RegType::vgpr) {
+         /* to swap the operands, we might also have to change the opcode */
+         switch (op) {
+            case aco_opcode::v_cmp_lt_f32:
+               op = aco_opcode::v_cmp_gt_f32;
+               break;
+            case aco_opcode::v_cmp_ge_f32:
+               op = aco_opcode::v_cmp_le_f32;
+               break;
+            case aco_opcode::v_cmp_lt_i32:
+               op = aco_opcode::v_cmp_gt_i32;
+               break;
+            case aco_opcode::v_cmp_ge_i32:
+               op = aco_opcode::v_cmp_le_i32;
+               break;
+            case aco_opcode::v_cmp_lt_u32:
+               op = aco_opcode::v_cmp_gt_u32;
+               break;
+            case aco_opcode::v_cmp_ge_u32:
+               op = aco_opcode::v_cmp_le_u32;
+               break;
+            case aco_opcode::v_cmp_lt_f64:
+               op = aco_opcode::v_cmp_gt_f64;
+               break;
+            case aco_opcode::v_cmp_ge_f64:
+               op = aco_opcode::v_cmp_le_f64;
+               break;
+            case aco_opcode::v_cmp_lt_i64:
+               op = aco_opcode::v_cmp_gt_i64;
+               break;
+            case aco_opcode::v_cmp_ge_i64:
+               op = aco_opcode::v_cmp_le_i64;
+               break;
+            case aco_opcode::v_cmp_lt_u64:
+               op = aco_opcode::v_cmp_gt_u64;
+               break;
+            case aco_opcode::v_cmp_ge_u64:
+               op = aco_opcode::v_cmp_le_u64;
+               break;
+            default: /* eq and ne are commutative */
+               break;
+         }
+         Temp t = src0;
+         src0 = src1;
+         src1 = t;
+      } else {
+         src1 = as_vgpr(ctx, src1);
+      }
+   }
+   Builder bld(ctx->program, ctx->block);
+   bld.vopc(op, Definition(dst), src0, src1).def(0).setHint(vcc);
+}
+
+void emit_comparison(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
+{
+   if (dst.regClass() == s2) {
+      emit_vopc_instruction(ctx, instr, op, dst);
+      if (!ctx->divergent_vals[instr->dest.dest.ssa.index])
+         emit_split_vector(ctx, dst, 2);
+   } else if (dst.regClass() == s1) {
+      Temp src0 = get_alu_src(ctx, instr->src[0]);
+      Temp src1 = get_alu_src(ctx, instr->src[1]);
+      assert(src0.type() == RegType::sgpr && src1.type() == RegType::sgpr);
+
+      Builder bld(ctx->program, ctx->block);
+      bld.sopc(op, bld.scc(Definition(dst)), src0, src1);
+
+   } else {
+      assert(false);
+   }
+}
+
+void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op32, aco_opcode op64, Temp dst)
+{
+   Builder bld(ctx->program, ctx->block);
+   Temp src0 = get_alu_src(ctx, instr->src[0]);
+   Temp src1 = get_alu_src(ctx, instr->src[1]);
+   if (dst.regClass() == s2) {
+      bld.sop2(op64, Definition(dst), bld.def(s1, scc),
+               as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
+   } else {
+      assert(dst.regClass() == s1);
+      bld.sop2(op32, bld.def(s1), bld.scc(Definition(dst)),
+               as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
+   }
+}
+
+
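+/* nir_op_bcsel: divergent selects use v_cndmask_b32 per dword, uniform selects
+ * use s_cselect, and boolean selects with a divergent condition are lowered to
+ * bit operations on the masks. */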
+void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
+{
+   Builder bld(ctx->program, ctx->block);
+   Temp cond = get_alu_src(ctx, instr->src[0]);
+   Temp then = get_alu_src(ctx, instr->src[1]);
+   Temp els = get_alu_src(ctx, instr->src[2]);
+
+   if (dst.type() == RegType::vgpr) {
+      cond = as_divergent_bool(ctx, cond, true);
+
+      aco_ptr<Instruction> bcsel;
+      if (dst.size() == 1) {
+         then = as_vgpr(ctx, then);
+         els = as_vgpr(ctx, els);
+
+         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
+      } else if (dst.size() == 2) {
+         Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
+         Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
+
+         Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
+         Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
+
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      return;
+   }
+
+   if (instr->dest.dest.ssa.bit_size != 1) { /* uniform condition and values in sgpr */
+      if (dst.regClass() == s1 || dst.regClass() == s2) {
+         assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
+         aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
+         bld.sop2(op, Definition(dst), then, els, bld.scc(as_uniform_bool(ctx, cond)));
+      } else {
+         fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      return;
+   }
+
+   /* boolean bcsel */
+   assert(instr->dest.dest.ssa.bit_size == 1);
+
+   if (dst.regClass() == s1)
+      cond = as_uniform_bool(ctx, cond);
+
+   if (cond.regClass() == s1) { /* uniform selection */
+      aco_opcode op;
+      if (dst.regClass() == s2) {
+         op = aco_opcode::s_cselect_b64;
+         then = as_divergent_bool(ctx, then, false);
+         els = as_divergent_bool(ctx, els, false);
+      } else {
+         assert(dst.regClass() == s1);
+         op = aco_opcode::s_cselect_b32;
+         then = as_uniform_bool(ctx, then);
+         els = as_uniform_bool(ctx, els);
+      }
+      bld.sop2(op, Definition(dst), then, els, bld.scc(cond));
+      return;
+   }
+
+   /* divergent boolean bcsel
+    * this implements bcsel on bools: dst = s0 ? s1 : s2
+    * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
+   assert (dst.regClass() == s2);
+   then = as_divergent_bool(ctx, then, false);
+   els = as_divergent_bool(ctx, els, false);
+
+   if (cond.id() != then.id())
+      then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then);
+
+   if (cond.id() == els.id())
+      bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then);
+   else
+      bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then,
+               bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond));
+}
+
+void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
+{
+   if (!instr->dest.dest.is_ssa) {
+      fprintf(stderr, "nir alu dst not in ssa: ");
+      nir_print_instr(&instr->instr, stderr);
+      fprintf(stderr, "\n");
+      abort();
+   }
+   Builder bld(ctx->program, ctx->block);
+   Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
+   switch(instr->op) {
+   case nir_op_vec2:
+   case nir_op_vec3:
+   case nir_op_vec4: {
+      std::array<Temp,4> elems;
+      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
+      for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
+         elems[i] = get_alu_src(ctx, instr->src[i]);
+         vec->operands[i] = Operand{elems[i]};
+      }
+      vec->definitions[0] = Definition(dst);
+      ctx->block->instructions.emplace_back(std::move(vec));
+      ctx->allocated_vec.emplace(dst.id(), elems);
+      break;
+   }
+   case nir_op_mov: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      aco_ptr<Instruction> mov;
+      if (dst.type() == RegType::sgpr) {
+         if (src.type() == RegType::vgpr)
+            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+         else if (src.regClass() == s1)
+            bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
+         else if (src.regClass() == s2)
+            bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
+         else
+            unreachable("wrong src register class for nir_op_mov");
+      } else if (dst.regClass() == v1) {
+         bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
+      } else if (dst.regClass() == v2) {
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
+      } else {
+         nir_print_instr(&instr->instr, stderr);
+         unreachable("Should have been lowered to scalar.");
+      }
+      break;
+   }
+   case nir_op_inot: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      /* uniform booleans */
+      if (instr->dest.dest.ssa.bit_size == 1 && dst.regClass() == s1) {
+         if (src.regClass() == s1) {
+            /* in this case, src is either 1 or 0 */
+            bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.scc(Definition(dst)), Operand(1u), src);
+         } else {
+            /* src is either exec_mask or 0 */
+            assert(src.regClass() == s2);
+            bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(Definition(dst)), Operand(0u), src);
+         }
+      } else if (dst.regClass() == v1) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
+      } else if (dst.type() == RegType::sgpr) {
+         aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
+         bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ineg: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (dst.regClass() == v1) {
+         bld.vsub32(Definition(dst), Operand(0u), Operand(src));
+      } else if (dst.regClass() == s1) {
+         bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_iabs: {
+      if (dst.regClass() == s1) {
+         bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
+      } else if (dst.regClass() == v1) {
+         Temp src = get_alu_src(ctx, instr->src[0]);
+         bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_isign: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (dst.regClass() == s1) {
+         Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
+         Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
+         bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
+      } else if (dst.regClass() == s2) {
+         Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
+         Temp neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
+         bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz);
+      } else if (dst.regClass() == v1) {
+         Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
+         Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
+      } else if (dst.regClass() == v2) {
+         Temp upper = emit_extract_vector(ctx, src, 1, v1);
+         Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
+         Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+         Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
+         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_imax: {
+      if (dst.regClass() == v1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
+      } else if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_umax: {
+      if (dst.regClass() == v1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
+      } else if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_imin: {
+      if (dst.regClass() == v1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
+      } else if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_umin: {
+      if (dst.regClass() == v1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
+      } else if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ior: {
+      if (instr->dest.dest.ssa.bit_size == 1) {
+         emit_boolean_logic(ctx, instr, aco_opcode::s_or_b32, aco_opcode::s_or_b64, dst);
+      } else if (dst.regClass() == v1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
+      } else if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
+      } else if (dst.regClass() == s2) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_iand: {
+      if (instr->dest.dest.ssa.bit_size == 1) {
+         emit_boolean_logic(ctx, instr, aco_opcode::s_and_b32, aco_opcode::s_and_b64, dst);
+      } else if (dst.regClass() == v1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
+      } else if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
+      } else if (dst.regClass() == s2) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ixor: {
+      if (instr->dest.dest.ssa.bit_size == 1) {
+         emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::s_xor_b64, dst);
+      } else if (dst.regClass() == v1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
+      } else if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
+      } else if (dst.regClass() == s2) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ushr: {
+      if (dst.regClass() == v1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
+      } else if (dst.regClass() == v2) {
+         bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
+                  get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
+      } else if (dst.regClass() == s2) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
+      } else if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ishl: {
+      if (dst.regClass() == v1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
+      } else if (dst.regClass() == v2) {
+         bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
+                  get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
+      } else if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
+      } else if (dst.regClass() == s2) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ishr: {
+      if (dst.regClass() == v1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
+      } else if (dst.regClass() == v2) {
+         bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
+                  get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
+      } else if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
+      } else if (dst.regClass() == s2) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_find_lsb: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (src.regClass() == s1) {
+         bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
+      } else if (src.regClass() == v1) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
+      } else if (src.regClass() == s2) {
+         bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ufind_msb:
+   case nir_op_ifind_msb: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (src.regClass() == s1 || src.regClass() == s2) {
+         aco_opcode op = src.regClass() == s2 ?
+                         (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
+                         (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
+         Temp msb_rev = bld.sop1(op, bld.def(s1), src);
+
+         Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
+                                        Operand(src.size() * 32u - 1u), msb_rev);
+         Temp msb = sub.def(0).getTemp();
+         Temp carry = sub.def(1).getTemp();
+
+         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry);
+      } else if (src.regClass() == v1) {
+         aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
+         Temp msb_rev = bld.tmp(v1);
+         emit_vop1_instruction(ctx, instr, op, msb_rev);
+         Temp msb = bld.tmp(v1);
+         Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
+         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_bitfield_reverse: {
+      if (dst.regClass() == s1) {
+         bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
+      } else if (dst.regClass() == v1) {
+         bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_iadd: {
+      if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
+         break;
+      }
+
+      Temp src0 = get_alu_src(ctx, instr->src[0]);
+      Temp src1 = get_alu_src(ctx, instr->src[1]);
+      if (dst.regClass() == v1) {
+         bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
+         break;
+      }
+
+      assert(src0.size() == 2 && src1.size() == 2);
+      Temp src00 = bld.tmp(src0.type(), 1);
+      Temp src01 = bld.tmp(dst.type(), 1);
+      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
+      Temp src10 = bld.tmp(src1.type(), 1);
+      Temp src11 = bld.tmp(dst.type(), 1);
+      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
+
+      if (dst.regClass() == s2) {
+         Temp carry = bld.tmp(s1);
+         Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
+         Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
+      } else if (dst.regClass() == v2) {
+         Temp dst0 = bld.tmp(v1);
+         Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
+         Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_uadd_sat: {
+      Temp src0 = get_alu_src(ctx, instr->src[0]);
+      Temp src1 = get_alu_src(ctx, instr->src[1]);
+      if (dst.regClass() == s1) {
+         Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
+         bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
+                  src0, src1);
+         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
+      } else if (dst.regClass() == v1) {
+         if (ctx->options->chip_class >= GFX9) {
+            aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
+            add->operands[0] = Operand(src0);
+            add->operands[1] = Operand(src1);
+            add->definitions[0] = Definition(dst);
+            add->clamp = 1;
+            ctx->block->instructions.emplace_back(std::move(add));
+         } else {
+            if (src1.regClass() != v1)
+               std::swap(src0, src1);
+            assert(src1.regClass() == v1);
+            Temp tmp = bld.tmp(v1);
+            Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
+            bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
+         }
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_uadd_carry: {
+      Temp src0 = get_alu_src(ctx, instr->src[0]);
+      Temp src1 = get_alu_src(ctx, instr->src[1]);
+      if (dst.regClass() == s1) {
+         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
+         break;
+      }
+      if (dst.regClass() == v1) {
+         Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
+         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
+         break;
+      }
+
+      Temp src00 = bld.tmp(src0.type(), 1);
+      Temp src01 = bld.tmp(dst.type(), 1);
+      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
+      Temp src10 = bld.tmp(src1.type(), 1);
+      Temp src11 = bld.tmp(dst.type(), 1);
+      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
+      if (dst.regClass() == s2) {
+         Temp carry = bld.tmp(s1);
+         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
+         carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
+      } else if (dst.regClass() == v2) {
+         Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
+         carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
+         carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_isub: {
+      if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
+         break;
+      }
+
+      Temp src0 = get_alu_src(ctx, instr->src[0]);
+      Temp src1 = get_alu_src(ctx, instr->src[1]);
+      if (dst.regClass() == v1) {
+         bld.vsub32(Definition(dst), src0, src1);
+         break;
+      }
+
+      Temp src00 = bld.tmp(src0.type(), 1);
+      Temp src01 = bld.tmp(dst.type(), 1);
+      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
+      Temp src10 = bld.tmp(src1.type(), 1);
+      Temp src11 = bld.tmp(dst.type(), 1);
+      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
+      if (dst.regClass() == s2) {
+         Temp carry = bld.tmp(s1);
+         Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
+         Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
+      } else if (dst.regClass() == v2) {
+         Temp lower = bld.tmp(v1);
+         Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
+         Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_usub_borrow: {
+      Temp src0 = get_alu_src(ctx, instr->src[0]);
+      Temp src1 = get_alu_src(ctx, instr->src[1]);
+      if (dst.regClass() == s1) {
+         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
+         break;
+      } else if (dst.regClass() == v1) {
+         Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
+         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
+         break;
+      }
+
+      Temp src00 = bld.tmp(src0.type(), 1);
+      Temp src01 = bld.tmp(dst.type(), 1);
+      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
+      Temp src10 = bld.tmp(src1.type(), 1);
+      Temp src11 = bld.tmp(dst.type(), 1);
+      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
+      if (dst.regClass() == s2) {
+         Temp borrow = bld.tmp(s1);
+         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
+         borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
+      } else if (dst.regClass() == v2) {
+         Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
+         borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
+         borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_imul: {
+      if (dst.regClass() == v1) {
+         bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
+                  get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
+      } else if (dst.regClass() == s1) {
+         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_umul_high: {
+      if (dst.regClass() == v1) {
+         bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
+      } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
+         bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
+      } else if (dst.regClass() == s1) {
+         Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
+                             as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_imul_high: {
+      if (dst.regClass() == v1) {
+         bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
+      } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
+         bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
+      } else if (dst.regClass() == s1) {
+         Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
+                             as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fmul: {
+      if (dst.size() == 1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
+      } else if (dst.size() == 2) {
+         bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
+                  as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fadd: {
+      if (dst.size() == 1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
+      } else if (dst.size() == 2) {
+         bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
+                  as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fsub: {
+      Temp src0 = get_alu_src(ctx, instr->src[0]);
+      Temp src1 = get_alu_src(ctx, instr->src[1]);
+      if (dst.size() == 1) {
+         if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
+            emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
+         else
+            emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
+      } else if (dst.size() == 2) {
+         Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
+                                     get_alu_src(ctx, instr->src[0]),
+                                     as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+         VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
+         sub->neg[1] = true;
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fmod:
+   case nir_op_frem: {
+      if (dst.size() == 1) {
+         Temp rcp = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_alu_src(ctx, instr->src[1]));
+         Temp mul = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), get_alu_src(ctx, instr->src[0]), rcp);
+
+         aco_opcode op = instr->op == nir_op_fmod ? aco_opcode::v_floor_f32 : aco_opcode::v_trunc_f32;
+         Temp floor = bld.vop1(op, bld.def(v1), mul);
+
+         mul = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), get_alu_src(ctx, instr->src[1]), floor);
+         bld.vop2(aco_opcode::v_sub_f32, Definition(dst), get_alu_src(ctx, instr->src[0]), mul);
+      } else if (dst.size() == 2) {
+         Temp rcp = bld.vop1(aco_opcode::v_rcp_f64, bld.def(v2), get_alu_src(ctx, instr->src[1]));
+         Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), get_alu_src(ctx, instr->src[0]), rcp);
+
+         aco_opcode op = instr->op == nir_op_fmod ? aco_opcode::v_floor_f64 : aco_opcode::v_trunc_f64;
+         Temp floor = bld.vop1(op, bld.def(v2), mul);
+
+         mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), get_alu_src(ctx, instr->src[1]), floor);
+         Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]), mul);
+         VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
+         sub->neg[1] = true;
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fmax: {
+      if (dst.size() == 1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
+      } else if (dst.size() == 2) {
+         bld.vop3(aco_opcode::v_max_f64, Definition(dst),
+                  get_alu_src(ctx, instr->src[0]),
+                  as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fmin: {
+      if (dst.size() == 1) {
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true);
+      } else if (dst.size() == 2) {
+         bld.vop3(aco_opcode::v_min_f64, Definition(dst),
+                  get_alu_src(ctx, instr->src[0]),
+                  as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fmax3: {
+      if (dst.size() == 1) {
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fmin3: {
+      if (dst.size() == 1) {
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fmed3: {
+      if (dst.size() == 1) {
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_umax3: {
+      if (dst.size() == 1) {
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_umin3: {
+      if (dst.size() == 1) {
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_umed3: {
+      if (dst.size() == 1) {
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_imax3: {
+      if (dst.size() == 1) {
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_imin3: {
+      if (dst.size() == 1) {
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_imed3: {
+      if (dst.size() == 1) {
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_cube_face_coord: {
+      Temp in = get_alu_src(ctx, instr->src[0], 3);
+      Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
+                      emit_extract_vector(ctx, in, 1, v1),
+                      emit_extract_vector(ctx, in, 2, v1) };
+      Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
+      ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
+      Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
+      Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
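+      /* scale by the reciprocal of the major-axis term and bias by 0.5 to get face coordinates in [0,1] */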
+      sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
+      tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
+      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
+      break;
+   }
+   case nir_op_cube_face_index: {
+      Temp in = get_alu_src(ctx, instr->src[0], 3);
+      Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
+                      emit_extract_vector(ctx, in, 1, v1),
+                      emit_extract_vector(ctx, in, 2, v1) };
+      bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
+      break;
+   }
+   case nir_op_bcsel: {
+      emit_bcsel(ctx, instr, dst);
+      break;
+   }
+   case nir_op_frsq: {
+      if (dst.size() == 1) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f32, dst);
+      } else if (dst.size() == 2) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fneg: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (dst.size() == 1) {
+         bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
+      } else if (dst.size() == 2) {
+         Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
+         upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fabs: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (dst.size() == 1) {
+         bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
+      } else if (dst.size() == 2) {
+         Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
+         upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fsat: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (dst.size() == 1) {
+         bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
+      } else if (dst.size() == 2) {
+         Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
+         VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
+         vop3->clamp = true;
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_flog2: {
+      if (dst.size() == 1) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f32, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_frcp: {
+      if (dst.size() == 1) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f32, dst);
+      } else if (dst.size() == 2) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fexp2: {
+      if (dst.size() == 1) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fsqrt: {
+      if (dst.size() == 1) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f32, dst);
+      } else if (dst.size() == 2) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ffract: {
+      if (dst.size() == 1) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
+      } else if (dst.size() == 2) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ffloor: {
+      if (dst.size() == 1) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
+      } else if (dst.size() == 2) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fceil: {
+      if (dst.size() == 1) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
+      } else if (dst.size() == 2) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ftrunc: {
+      if (dst.size() == 1) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
+      } else if (dst.size() == 2) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fround_even: {
+      if (dst.size() == 1) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
+      } else if (dst.size() == 2) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fsin:
+   case nir_op_fcos: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      aco_ptr<Instruction> norm;
+      if (dst.size() == 1) {
+         Temp tmp;
+         Operand half_pi(0x3e22f983u);
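+         /* 0x3e22f983 is 1/(2*PI): the hardware sin/cos expect an input where 1.0 corresponds to a full period */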
+         if (src.type() == RegType::sgpr)
+            tmp = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
+         else
+            tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
+
+         /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
+         if (ctx->options->chip_class < GFX9)
+            tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
+
+         aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
+         bld.vop1(opcode, Definition(dst), tmp);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ldexp: {
+      if (dst.size() == 1) {
+         bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
+                  as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
+                  get_alu_src(ctx, instr->src[1]));
+      } else if (dst.size() == 2) {
+         bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
+                  as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
+                  get_alu_src(ctx, instr->src[1]));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_frexp_sig: {
+      if (dst.size() == 1) {
+         bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
+                  get_alu_src(ctx, instr->src[0]));
+      } else if (dst.size() == 2) {
+         bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
+                  get_alu_src(ctx, instr->src[0]));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_frexp_exp: {
+      if (instr->src[0].src.ssa->bit_size == 32) {
+         bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
+                  get_alu_src(ctx, instr->src[0]));
+      } else if (instr->src[0].src.ssa->bit_size == 64) {
+         bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
+                  get_alu_src(ctx, instr->src[0]));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fsign: {
+      Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
+      if (dst.size() == 1) {
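+         /* replace strictly positive values with +1.0, then replace values that are not >= 0 with -1.0 */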
+         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+         src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
+         cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
+      } else if (dst.size() == 2) {
+         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+         Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
+         Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, src, cond);
+
+         cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+         tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
+         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
+
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_f2f32: {
+      if (instr->src[0].src.ssa->bit_size == 64) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_f2f64: {
+      if (instr->src[0].src.ssa->bit_size == 32) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_i2f32: {
+      assert(dst.size() == 1);
+      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
+      break;
+   }
+   case nir_op_i2f64: {
+      if (instr->src[0].src.ssa->bit_size == 32) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
+      } else if (instr->src[0].src.ssa->bit_size == 64) {
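+         /* convert the low dword as unsigned and the high dword as signed, then combine as high * 2^32 + low */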
+         Temp src = get_alu_src(ctx, instr->src[0]);
+         RegClass rc = RegClass(src.type(), 1);
+         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
+         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
+         upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
+         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
+         bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
+
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_u2f32: {
+      assert(dst.size() == 1);
+      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
+      break;
+   }
+   case nir_op_u2f64: {
+      if (instr->src[0].src.ssa->bit_size == 32) {
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
+      } else if (instr->src[0].src.ssa->bit_size == 64) {
+         Temp src = get_alu_src(ctx, instr->src[0]);
+         RegClass rc = RegClass(src.type(), 1);
+         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
+         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
+         upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
+         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
+         bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_f2i32: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (instr->src[0].src.ssa->bit_size == 32) {
+         if (dst.type() == RegType::vgpr)
+            bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
+         else
+            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
+                       bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
+
+      } else if (instr->src[0].src.ssa->bit_size == 64) {
+         if (dst.type() == RegType::vgpr)
+            bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
+         else
+            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
+                       bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
+
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_f2u32: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (instr->src[0].src.ssa->bit_size == 32) {
+         if (dst.type() == RegType::vgpr)
+            bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
+         else
+            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
+                       bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
+
+      } else if (instr->src[0].src.ssa->bit_size == 64) {
+         if (dst.type() == RegType::vgpr)
+            bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
+         else
+            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
+                       bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
+
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_f2i64: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
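+         /* manual f32 -> i64 conversion: clamp the exponent, shift the (hidden bit | mantissa)
+          * into a 64-bit value, then apply the sign via xor and subtract (two's complement) */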
+         Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
+         exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
+         Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
+         Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
+         mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
+         mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
+         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
+         Temp new_exponent = bld.tmp(v1);
+         Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
+         mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
+         Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
+         Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
+         lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
+         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
+         lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
+         upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
+         Temp new_lower = bld.tmp(v1);
+         borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
+         Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
+
+      } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
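+         /* scalar variant of the conversion above, using SALU bit-field extract and 64-bit shifts */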
+         if (src.type() == RegType::vgpr)
+            src = bld.as_uniform(src);
+         Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
+         exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
+         exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
+         exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
+         Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
+         Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
+         mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
+         mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
+         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
+         exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
+         mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
+         Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
+         Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
+         mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
+         Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
+         lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
+         upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
+         Temp borrow = bld.tmp(s1);
+         lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
+         upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
+
+      } else if (instr->src[0].src.ssa->bit_size == 64) {
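+         /* split the double: 0x3df00000 is the high dword of 2^-32 and 0xc1f00000 that of -2^32,
+          * so floor(trunc(src) * 2^-32) yields the upper dword and the fma recovers the lower dword */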
+         Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
+         Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
+         Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
+         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
+         Temp floor  = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
+         Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
+         Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
+         Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
+         if (dst.type() == RegType::sgpr) {
+            lower = bld.as_uniform(lower);
+            upper = bld.as_uniform(upper);
+         }
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
+
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_f2u64: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
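+         /* unsigned variant of the f2i64 lowering above: no sign handling, and exponents above 64 saturate the result to all ones */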
+         Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
+         Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent);
+         exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
+         Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
+         mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
+         Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
+         Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
+         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
+         Temp new_exponent = bld.tmp(v1);
+         Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
+         mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
+         Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
+         lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
+         upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
+         lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
+         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
+
+      } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
+         if (src.type() == RegType::vgpr)
+            src = bld.as_uniform(src);
+         Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
+         exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
+         exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
+         Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
+         mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
+         Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
+         Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
+         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
+         Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
+         mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
+         Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
+         mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
+         Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
+         Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
+         lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
+         upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
+
+      } else if (instr->src[0].src.ssa->bit_size == 64) {
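+         /* same high/low split as in the 64-bit f2i64 path, but both halves are converted as unsigned */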
+         Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
+         Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
+         Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
+         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
+         Temp floor  = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
+         Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
+         Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
+         Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
+         if (dst.type() == RegType::sgpr) {
+            lower = bld.as_uniform(lower);
+            upper = bld.as_uniform(upper);
+         }
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
+
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_b2f32: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (dst.regClass() == s1) {
+         src = as_uniform_bool(ctx, src);
+         bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
+      } else if (dst.regClass() == v1) {
+         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u),
+                      as_divergent_bool(ctx, src, true));
+      } else {
+         unreachable("Wrong destination register class for nir_op_b2f32.");
+      }
+      break;
+   }
+   case nir_op_b2f64: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (dst.regClass() == s2) {
+         src = as_uniform_bool(ctx, src);
+         bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
+      } else if (dst.regClass() == v2) {
+         Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
+         Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one,
+                      as_divergent_bool(ctx, src, true));
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
+      } else {
+         unreachable("Wrong destination register class for nir_op_b2f64.");
+      }
+      break;
+   }
+   case nir_op_i2i32: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (instr->src[0].src.ssa->bit_size == 64) {
+         /* we can simply use the lower dword: dst maps to the lower register of src */
+         emit_extract_vector(ctx, src, 0, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_u2u32: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (instr->src[0].src.ssa->bit_size == 16) {
+         if (dst.regClass() == s1) {
+            bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
+         } else {
+            // TODO: do better with SDWA
+            bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
+         }
+      } else if (instr->src[0].src.ssa->bit_size == 64) {
+         /* we can simply use the lower dword: dst maps to the lower register of src */
+         emit_extract_vector(ctx, src, 0, dst);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_i2i64: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (instr->src[0].src.ssa->bit_size == 32) {
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_u2u64: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (instr->src[0].src.ssa->bit_size == 32) {
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_b2i32: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (dst.regClass() == s1) {
+         if (src.regClass() == s1) {
+            bld.copy(Definition(dst), src);
+         } else {
+            // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
+            assert(src.regClass() == s2);
+            bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(Definition(dst)), Operand(0u), src);
+         }
+      } else {
+         assert(dst.regClass() == v1 && src.regClass() == s2);
+         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
+      }
+      break;
+   }
+   case nir_op_i2b1: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (dst.regClass() == s2) {
+         assert(src.regClass() == v1 || src.regClass() == v2);
+         bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
+                  Definition(dst), Operand(0u), src).def(0).setHint(vcc);
+      } else {
+         assert(src.regClass() == s1 && dst.regClass() == s1);
+         bld.sopc(aco_opcode::s_cmp_lg_u32, bld.scc(Definition(dst)), Operand(0u), src);
+      }
+      break;
+   }
+   case nir_op_pack_64_2x32_split: {
+      Temp src0 = get_alu_src(ctx, instr->src[0]);
+      Temp src1 = get_alu_src(ctx, instr->src[1]);
+
+      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
+      break;
+   }
+   case nir_op_unpack_64_2x32_split_x:
+      bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
+      break;
+   case nir_op_unpack_64_2x32_split_y:
+      bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
+      break;
+   case nir_op_pack_half_2x16: {
+      Temp src = get_alu_src(ctx, instr->src[0], 2);
+
+      if (dst.regClass() == v1) {
+         Temp src0 = bld.tmp(v1);
+         Temp src1 = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
+         bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
+
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_unpack_half_2x16_split_x: {
+      if (dst.regClass() == v1) {
+         Builder bld(ctx->program, ctx->block);
+         bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_unpack_half_2x16_split_y: {
+      if (dst.regClass() == v1) {
+         Builder bld(ctx->program, ctx->block);
+         /* TODO: use SDWA here */
+         bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
+                  bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_fquantize2f16: {
+      Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), get_alu_src(ctx, instr->src[0]));
+
+      Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* class mask: true iff the value is NOT a positive/negative denormal */
+
+      Temp cmp_res = bld.tmp(s2);
+      bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc);
+
+      Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
+
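+      /* if the f16 result was a denormal, flush the final result to zero */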
+      bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
+      break;
+   }
+   case nir_op_bfm: {
+      Temp bits = get_alu_src(ctx, instr->src[0]);
+      Temp offset = get_alu_src(ctx, instr->src[1]);
+
+      if (dst.regClass() == s1) {
+         bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
+      } else if (dst.regClass() == v1) {
+         bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_bitfield_select: {
+      /* (mask & insert) | (~mask & base) */
+      Temp bitmask = get_alu_src(ctx, instr->src[0]);
+      Temp insert = get_alu_src(ctx, instr->src[1]);
+      Temp base = get_alu_src(ctx, instr->src[2]);
+
+      /* dst = (insert & bitmask) | (base & ~bitmask) */
+      if (dst.regClass() == s1) {
+         aco_ptr<Instruction> sop2;
+         nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
+         nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
+         Operand lhs;
+         if (const_insert && const_bitmask) {
+            lhs = Operand(const_insert->u32 & const_bitmask->u32);
+         } else {
+            insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
+            lhs = Operand(insert);
+         }
+
+         Operand rhs;
+         nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
+         if (const_base && const_bitmask) {
+            rhs = Operand(const_base->u32 & ~const_bitmask->u32);
+         } else {
+            base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
+            rhs = Operand(base);
+         }
+
+         bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
+
+      } else if (dst.regClass() == v1) {
+         if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
+            base = as_vgpr(ctx, base);
+         if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
+            insert = as_vgpr(ctx, insert);
+
+         bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
+
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ubfe:
+   case nir_op_ibfe: {
+      Temp base = get_alu_src(ctx, instr->src[0]);
+      Temp offset = get_alu_src(ctx, instr->src[1]);
+      Temp bits = get_alu_src(ctx, instr->src[2]);
+
+      if (dst.type() == RegType::sgpr) {
+         Operand extract;
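+         /* s_bfe packs the field offset in the low bits and the field width in bits [22:16] of its second source */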
+         nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
+         nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
+         if (const_offset && const_bits) {
+            uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
+            extract = Operand(const_extract);
+         } else {
+            Operand width;
+            if (const_bits) {
+               width = Operand(const_bits->u32 << 16);
+            } else {
+               width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
+            }
+            extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
+         }
+
+         aco_opcode opcode;
+         if (dst.regClass() == s1) {
+            if (instr->op == nir_op_ubfe)
+               opcode = aco_opcode::s_bfe_u32;
+            else
+               opcode = aco_opcode::s_bfe_i32;
+         } else if (dst.regClass() == s2) {
+            if (instr->op == nir_op_ubfe)
+               opcode = aco_opcode::s_bfe_u64;
+            else
+               opcode = aco_opcode::s_bfe_i64;
+         } else {
+            unreachable("Unsupported BFE bit size");
+         }
+
+         bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
+
+      } else {
+         aco_opcode opcode;
+         if (dst.regClass() == v1) {
+            if (instr->op == nir_op_ubfe)
+               opcode = aco_opcode::v_bfe_u32;
+            else
+               opcode = aco_opcode::v_bfe_i32;
+         } else {
+            unreachable("Unsupported BFE bit size");
+         }
+
+         emit_vop3a_instruction(ctx, instr, opcode, dst);
+      }
+      break;
+   }
+   case nir_op_bit_count: {
+      Temp src = get_alu_src(ctx, instr->src[0]);
+      if (src.regClass() == s1) {
+         bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
+      } else if (src.regClass() == v1) {
+         bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
+      } else if (src.regClass() == v2) {
+         bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
+                  emit_extract_vector(ctx, src, 1, v1),
+                  bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
+                           emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
+      } else if (src.regClass() == s2) {
+         bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_flt: {
+      if (instr->src[0].src.ssa->bit_size == 32)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f32, dst);
+      else if (instr->src[0].src.ssa->bit_size == 64)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f64, dst);
+      break;
+   }
+   case nir_op_fge: {
+      if (instr->src[0].src.ssa->bit_size == 32)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f32, dst);
+      else if (instr->src[0].src.ssa->bit_size == 64)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f64, dst);
+      break;
+   }
+   case nir_op_feq: {
+      if (instr->src[0].src.ssa->bit_size == 32)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f32, dst);
+      else if (instr->src[0].src.ssa->bit_size == 64)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f64, dst);
+      break;
+   }
+   case nir_op_fne: {
+      if (instr->src[0].src.ssa->bit_size == 32)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f32, dst);
+      else if (instr->src[0].src.ssa->bit_size == 64)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f64, dst);
+      break;
+   }
+   case nir_op_ilt: {
+      if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i32, dst);
+      else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
+         emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_i32, dst);
+      else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i64, dst);
+      break;
+   }
+   case nir_op_ige: {
+      if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i32, dst);
+      else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
+         emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_i32, dst);
+      else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i64, dst);
+      break;
+   }
+   case nir_op_ieq: {
+      if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i32, dst);
+      } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
+         emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_i32, dst);
+      } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i64, dst);
+      } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
+         emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_u64, dst);
+      } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
+         Temp src0 = get_alu_src(ctx, instr->src[0]);
+         Temp src1 = get_alu_src(ctx, instr->src[1]);
+         bld.sopc(aco_opcode::s_cmp_eq_i32, bld.scc(Definition(dst)),
+                  as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
+      } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
+         Temp src0 = get_alu_src(ctx, instr->src[0]);
+         Temp src1 = get_alu_src(ctx, instr->src[1]);
+         bld.sop2(aco_opcode::s_xnor_b64, Definition(dst), bld.def(s1, scc),
+                  as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ine: {
+      if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i32, dst);
+      } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i64, dst);
+      } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
+         emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_i32, dst);
+      } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
+         emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_u64, dst);
+      } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
+         Temp src0 = get_alu_src(ctx, instr->src[0]);
+         Temp src1 = get_alu_src(ctx, instr->src[1]);
+         bld.sopc(aco_opcode::s_cmp_lg_i32, bld.scc(Definition(dst)),
+                  as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
+      } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
+         Temp src0 = get_alu_src(ctx, instr->src[0]);
+         Temp src1 = get_alu_src(ctx, instr->src[1]);
+         bld.sop2(aco_opcode::s_xor_b64, Definition(dst), bld.def(s1, scc),
+                  as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_op_ult: {
+      if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u32, dst);
+      else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
+         emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_u32, dst);
+      else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u64, dst);
+      break;
+   }
+   case nir_op_uge: {
+      if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u32, dst);
+      else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
+         emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_u32, dst);
+      else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
+         emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u64, dst);
+      break;
+   }
+   case nir_op_fddx:
+   case nir_op_fddy:
+   case nir_op_fddx_fine:
+   case nir_op_fddy_fine:
+   case nir_op_fddx_coarse:
+   case nir_op_fddy_coarse: {
+      Definition tl = bld.def(v1);
+      uint16_t dpp_ctrl;
+      if (instr->op == nir_op_fddx_fine) {
+         bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 2, 2));
+         dpp_ctrl = dpp_quad_perm(1, 1, 3, 3);
+      } else if (instr->op == nir_op_fddy_fine) {
+         bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 1, 0, 1));
+         dpp_ctrl = dpp_quad_perm(2, 3, 2, 3);
+      } else {
+         bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 0, 0));
+         if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
+            dpp_ctrl = dpp_quad_perm(1, 1, 1, 1);
+         else
+            dpp_ctrl = dpp_quad_perm(2, 2, 2, 2);
+      }
+
+      Definition tmp = bld.def(v1);
+      bld.vop2_dpp(aco_opcode::v_sub_f32, tmp, get_alu_src(ctx, instr->src[0]), tl.getTemp(), dpp_ctrl);
+      emit_wqm(ctx, tmp.getTemp(), dst, true);
+      break;
+   }
+   default:
+      fprintf(stderr, "Unknown NIR ALU instr: ");
+      nir_print_instr(&instr->instr, stderr);
+      fprintf(stderr, "\n");
+   }
+}
+
+void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
+{
+   Temp dst = get_ssa_temp(ctx, &instr->def);
+
+   // TODO: we really want to know the resulting type here, as that would allow
+   // 64-bit literals, which otherwise get truncated (the lsb if double, the msb if int)
+   // for now, we only use s_mov_b64 with 64-bit inline constants
+   assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
+   assert(dst.type() == RegType::sgpr);
+
+   if (dst.size() == 1)
+   {
+      Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(instr->value[0].u32));
+   } else {
+      assert(dst.size() != 1);
+      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
+      if (instr->def.bit_size == 64)
+         for (unsigned i = 0; i < dst.size(); i++)
+            vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
+      else {
+         for (unsigned i = 0; i < dst.size(); i++)
+            vec->operands[i] = Operand{instr->value[i].u32};
+      }
+      vec->definitions[0] = Definition(dst);
+      ctx->block->instructions.emplace_back(std::move(vec));
+   }
+}
+
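+/* Widens a per-component write mask so that every source bit covers `multiplier`
+ * destination bits, e.g. widen_mask(0b1011, 2) == 0b11001111. Used below to turn
+ * a 64-bit write mask into a mask over 32-bit output slots. */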
+uint32_t widen_mask(uint32_t mask, unsigned multiplier)
+{
+   uint32_t new_mask = 0;
+   for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
+      if (mask & (1u << i))
+         new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
+   return new_mask;
+}
+
+void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   /* This wouldn't work inside control flow or with indirect offsets but
+    * that doesn't happen because of nir_lower_io_to_temporaries(). */
+
+   unsigned write_mask = nir_intrinsic_write_mask(instr);
+   unsigned component = nir_intrinsic_component(instr);
+   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+   unsigned idx = nir_intrinsic_base(instr) + component;
+
+   nir_instr *off_instr = instr->src[1].ssa->parent_instr;
+   if (off_instr->type != nir_instr_type_load_const) {
+      fprintf(stderr, "Unimplemented nir_intrinsic_store_output offset\n");
+      nir_print_instr(off_instr, stderr);
+      fprintf(stderr, "\n");
+   }
+   idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
+
+   if (instr->src[0].ssa->bit_size == 64)
+      write_mask = widen_mask(write_mask, 2);
+
+   for (unsigned i = 0; i < 8; ++i) {
+      if (write_mask & (1 << i)) {
+         ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
+         ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
+      }
+      idx++;
+   }
+}
+
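+/* Lowers a fragment shader output store to an export: depth, stencil and sample
+ * mask go to the MRTZ export target, color outputs to MRT0+index with the packing
+ * and enabled channels derived from the pipeline key's color format. */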
+void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   unsigned write_mask = nir_intrinsic_write_mask(instr);
+   Operand values[4];
+   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+   for (unsigned i = 0; i < 4; ++i) {
+      if (write_mask & (1 << i)) {
+         Temp tmp = emit_extract_vector(ctx, src, i, v1);
+         values[i] = Operand(tmp);
+      } else {
+         values[i] = Operand(v1);
+      }
+   }
+
+   unsigned index = nir_intrinsic_base(instr) / 4;
+   unsigned target, col_format;
+   unsigned enabled_channels = 0xF;
+   aco_opcode compr_op = (aco_opcode)0;
+
+   nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
+   assert(offset && "Non-const offsets on exports not yet supported");
+   index += offset->u32;
+
+   assert(index != FRAG_RESULT_COLOR);
+
+   /* Unlike vertex shader exports, it's fine to use multiple exports to
+    * export separate channels of one target. So shaders which export both
+    * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine.
+    * TODO: combine the exports in those cases and create better code
+    */
+
+   if (index == FRAG_RESULT_SAMPLE_MASK) {
+
+      if (ctx->program->info->ps.writes_z) {
+         target = V_008DFC_SQ_EXP_MRTZ;
+         enabled_channels = 0x4;
+         col_format = (unsigned) -1;
+
+         values[2] = values[0];
+         values[0] = Operand(v1);
+      } else {
+         aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
+         exp->valid_mask = false;
+         exp->done = false;
+         exp->compressed = true;
+         exp->dest = V_008DFC_SQ_EXP_MRTZ;
+         exp->enabled_mask = 0xc;
+         for (int i = 0; i < 4; i++)
+            exp->operands[i] = Operand(v1);
+         exp->operands[1] = Operand(values[0]);
+         ctx->block->instructions.emplace_back(std::move(exp));
+         return;
+      }
+
+   } else if (index == FRAG_RESULT_DEPTH) {
+
+      target = V_008DFC_SQ_EXP_MRTZ;
+      enabled_channels = 0x1;
+      col_format = (unsigned) -1;
+
+   } else if (index == FRAG_RESULT_STENCIL) {
+
+      if (ctx->program->info->ps.writes_z) {
+         target = V_008DFC_SQ_EXP_MRTZ;
+         enabled_channels = 0x2;
+         col_format = (unsigned) -1;
+
+         values[1] = values[0];
+         values[0] = Operand(v1);
+      } else {
+         aco_ptr<Instruction> shift{create_instruction<VOP2_instruction>(aco_opcode::v_lshlrev_b32, Format::VOP2, 2, 1)};
+         shift->operands[0] = Operand((uint32_t) 16);
+         shift->operands[1] = values[0];
+         Temp tmp = {ctx->program->allocateId(), v1};
+         shift->definitions[0] = Definition(tmp);
+         ctx->block->instructions.emplace_back(std::move(shift));
+
+         aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
+         exp->valid_mask = false;
+         exp->done = false;
+         exp->compressed = true;
+         exp->dest = V_008DFC_SQ_EXP_MRTZ;
+         exp->enabled_mask = 0x3;
+         exp->operands[0] = Operand(tmp);
+         for (int i = 1; i < 4; i++)
+            exp->operands[i] = Operand(v1);
+         ctx->block->instructions.emplace_back(std::move(exp));
+         return;
+      }
+
+   } else {
+      index -= FRAG_RESULT_DATA0;
+      target = V_008DFC_SQ_EXP_MRT + index;
+      col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
+   }
+   ASSERTED bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
+   ASSERTED bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
+   assert(!is_int8 && !is_int10);
+
+   switch (col_format)
+   {
+   case V_028714_SPI_SHADER_ZERO:
+      enabled_channels = 0; /* writemask */
+      target = V_008DFC_SQ_EXP_NULL;
+      break;
+
+   case V_028714_SPI_SHADER_32_R:
+      enabled_channels = 1;
+      break;
+
+   case V_028714_SPI_SHADER_32_GR:
+      enabled_channels = 0x3;
+      break;
+
+   case V_028714_SPI_SHADER_32_AR:
+      enabled_channels = 0x9;
+      break;
+
+   case V_028714_SPI_SHADER_FP16_ABGR:
+      enabled_channels = 0x5;
+      compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
+      break;
+
+   case V_028714_SPI_SHADER_UNORM16_ABGR:
+      enabled_channels = 0x5;
+      compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
+      break;
+
+   case V_028714_SPI_SHADER_SNORM16_ABGR:
+      enabled_channels = 0x5;
+      compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
+      break;
+
+   case V_028714_SPI_SHADER_UINT16_ABGR:
+      enabled_channels = 0x5;
+      compr_op = aco_opcode::v_cvt_pk_u16_u32;
+      break;
+
+   case V_028714_SPI_SHADER_SINT16_ABGR:
+      enabled_channels = 0x5;
+      compr_op = aco_opcode::v_cvt_pk_i16_i32;
+      break;
+
+   case V_028714_SPI_SHADER_32_ABGR:
+      enabled_channels = 0xF;
+      break;
+
+   default:
+      break;
+   }
+
+   if (target == V_008DFC_SQ_EXP_NULL)
+      return;
+
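+   /* For the 16-bit color formats, pack pairs of 32-bit channels into single
+    * dwords with the conversion opcode selected above and mark the export as
+    * compressed. */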
+   if ((bool)compr_op)
+   {
+      for (int i = 0; i < 2; i++)
+      {
+         /* check if at least one of the values to be compressed is enabled */
+         unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
+         if (enabled) {
+            enabled_channels |= enabled << (i*2);
+            aco_ptr<VOP3A_instruction> compr{create_instruction<VOP3A_instruction>(compr_op, Format::VOP3A, 2, 1)};
+            Temp tmp{ctx->program->allocateId(), v1};
+            compr->operands[0] = values[i*2].isUndefined() ? Operand(0u) : values[i*2];
+            compr->operands[1] = values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1];
+            compr->definitions[0] = Definition(tmp);
+            values[i] = Operand(tmp);
+            ctx->block->instructions.emplace_back(std::move(compr));
+         } else {
+            values[i] = Operand(v1);
+         }
+      }
+   }
+
+   aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
+   exp->valid_mask = false;
+   exp->done = false;
+   exp->compressed = (bool) compr_op;
+   exp->dest = target;
+   exp->enabled_mask = enabled_channels;
+   if ((bool) compr_op) {
+      for (int i = 0; i < 2; i++)
+         exp->operands[i] = enabled_channels & (3 << (i * 2)) ? values[i] : Operand(v1);
+      exp->operands[2] = Operand(v1);
+      exp->operands[3] = Operand(v1);
+   } else {
+      for (int i = 0; i < 4; i++)
+         exp->operands[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
+   }
+
+   ctx->block->instructions.emplace_back(std::move(exp));
+}
+
+void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   if (ctx->stage == vertex_vs) {
+      visit_store_vs_output(ctx, instr);
+   } else if (ctx->stage == fragment_fs) {
+      visit_store_fs_output(ctx, instr);
+   } else {
+      unreachable("Shader stage not implemented");
+   }
+}
+
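+/* Interpolates input `idx`.`component` at the barycentric coordinates in `src`
+ * using the two-step v_interp_p1_f32/v_interp_p2_f32 sequence; the primitive
+ * mask is passed in m0. */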
+void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
+{
+   Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
+   Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
+
+   Builder bld(ctx->program, ctx->block);
+   Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
+   bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component);
+}
+
+void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
+{
+   aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
+   for (unsigned i = 0; i < num_components; i++)
+      vec->operands[i] = Operand(ctx->fs_inputs[fs_input::frag_pos_0 + i]);
+
+   if (ctx->fs_vgpr_args[fs_input::frag_pos_3]) {
+      assert(num_components == 4);
+      Builder bld(ctx->program, ctx->block);
+      vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ctx->fs_inputs[fs_input::frag_pos_3]);
+   }
+
+   for (Operand& op : vec->operands)
+      op = op.isUndefined() ? Operand(0u) : op;
+
+   vec->definitions[0] = Definition(dst);
+   ctx->block->instructions.emplace_back(std::move(vec));
+   emit_split_vector(ctx, dst, num_components);
+   return;
+}
+
+void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+   Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
+   unsigned idx = nir_intrinsic_base(instr);
+   unsigned component = nir_intrinsic_component(instr);
+   Temp prim_mask = ctx->prim_mask;
+
+   nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
+   if (offset) {
+      assert(offset->u32 == 0);
+   } else {
+      /* the lower 15 bits of the prim_mask contain the offset into LDS
+       * while the upper bits contain the number of prims */
+      Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
+      assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
+      Builder bld(ctx->program, ctx->block);
+      Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
+      stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
+      stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
+      offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
+      prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
+   }
+
+   if (instr->dest.ssa.num_components == 1) {
+      emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
+   } else {
+      aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
+      for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
+      {
+         Temp tmp = {ctx->program->allocateId(), v1};
+         emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
+         vec->operands[i] = Operand(tmp);
+      }
+      vec->definitions[0] = Definition(dst);
+      ctx->block->instructions.emplace_back(std::move(vec));
+   }
+}
+
+unsigned get_num_channels_from_data_format(unsigned data_format)
+{
+   switch (data_format) {
+   case V_008F0C_BUF_DATA_FORMAT_8:
+   case V_008F0C_BUF_DATA_FORMAT_16:
+   case V_008F0C_BUF_DATA_FORMAT_32:
+      return 1;
+   case V_008F0C_BUF_DATA_FORMAT_8_8:
+   case V_008F0C_BUF_DATA_FORMAT_16_16:
+   case V_008F0C_BUF_DATA_FORMAT_32_32:
+      return 2;
+   case V_008F0C_BUF_DATA_FORMAT_10_11_11:
+   case V_008F0C_BUF_DATA_FORMAT_11_11_10:
+   case V_008F0C_BUF_DATA_FORMAT_32_32_32:
+      return 3;
+   case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
+   case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
+   case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
+   case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
+   case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
+      return 4;
+   default:
+      break;
+   }
+
+   return 4;
+}
+
+/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
+ * so we may need to fix it up. */
+Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
+      alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
+
+   /* For the integer-like cases, do a natural sign extension.
+    *
+    * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
+    * and happen to contain 0, 1, 2, 3 as the two LSBs of the
+    * exponent.
+    */
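+   /* Shifting left by 30 places the 2-bit integer alpha in bits 30-31 (for SNORM,
+    * shifting by 7 moves the exponent's two LSBs from bits 23-24 up there instead);
+    * the arithmetic shift right by 30 then sign-extends the value. */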
+   alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
+   alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
+
+   /* Convert back to the right type. */
+   if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
+      alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
+      Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha);
+      alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
+   } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
+      alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
+   }
+
+   return alpha;
+}
+
+void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Builder bld(ctx->program, ctx->block);
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+   if (ctx->stage & sw_vs) {
+
+      nir_instr *off_instr = instr->src[0].ssa->parent_instr;
+      if (off_instr->type != nir_instr_type_load_const) {
+         fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
+         nir_print_instr(off_instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
+
+      Temp vertex_buffers = convert_pointer_to_64_bit(ctx, ctx->vertex_buffers);
+
+      unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
+      unsigned component = nir_intrinsic_component(instr);
+      unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
+      uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
+      uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
+      unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
+
+      unsigned dfmt = attrib_format & 0xf;
+
+      unsigned nfmt = (attrib_format >> 4) & 0x7;
+      unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
+      unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
+      unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
+      unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
+      bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
+      if (post_shuffle)
+         num_channels = MAX2(num_channels, 3);
+
+      Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u));
+
+      Temp index;
+      if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
+         uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
+         if (divisor) {
+            ctx->needs_instance_id = true;
+
+            if (divisor != 1) {
+               Temp divided = bld.tmp(v1);
+               emit_v_div_u32(ctx, divided, as_vgpr(ctx, ctx->instance_id), divisor);
+               index = bld.vadd32(bld.def(v1), ctx->start_instance, divided);
+            } else {
+               index = bld.vadd32(bld.def(v1), ctx->start_instance, ctx->instance_id);
+            }
+         } else {
+            index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), ctx->start_instance);
+         }
+      } else {
+         index = bld.vadd32(bld.def(v1), ctx->base_vertex, ctx->vertex_id);
+      }
+
+      if (attrib_stride != 0 && attrib_offset > attrib_stride) {
+         index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
+         attrib_offset = attrib_offset % attrib_stride;
+      }
+
+      Operand soffset(0u);
+      if (attrib_offset >= 4096) {
+         soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
+         attrib_offset = 0;
+      }
+
+      aco_opcode opcode;
+      switch (num_channels) {
+      case 1:
+         opcode = aco_opcode::tbuffer_load_format_x;
+         break;
+      case 2:
+         opcode = aco_opcode::tbuffer_load_format_xy;
+         break;
+      case 3:
+         opcode = aco_opcode::tbuffer_load_format_xyz;
+         break;
+      case 4:
+         opcode = aco_opcode::tbuffer_load_format_xyzw;
+         break;
+      default:
+         unreachable("Unimplemented load_input vector size");
+      }
+
+      Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
+
+      aco_ptr<MTBUF_instruction> mubuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
+      mubuf->operands[0] = Operand(index);
+      mubuf->operands[1] = Operand(list);
+      mubuf->operands[2] = soffset;
+      mubuf->definitions[0] = Definition(tmp);
+      mubuf->idxen = true;
+      mubuf->can_reorder = true;
+      mubuf->dfmt = dfmt;
+      mubuf->nfmt = nfmt;
+      assert(attrib_offset < 4096);
+      mubuf->offset = attrib_offset;
+      ctx->block->instructions.emplace_back(std::move(mubuf));
+
+      emit_split_vector(ctx, tmp, tmp.size());
+
+      if (tmp.id() != dst.id()) {
+         bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
+                         nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
+
+         static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
+         static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
+         const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
+
+         aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
+         for (unsigned i = 0; i < dst.size(); i++) {
+            unsigned idx = i + component;
+            if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
+               Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
+               vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha));
+            } else if (idx < num_channels) {
+               vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
+            } else if (is_float && idx == 3) {
+               vec->operands[i] = Operand(0x3f800000u);
+            } else if (!is_float && idx == 3) {
+               vec->operands[i] = Operand(1u);
+            } else {
+               vec->operands[i] = Operand(0u);
+            }
+         }
+         vec->definitions[0] = Definition(dst);
+         ctx->block->instructions.emplace_back(std::move(vec));
+         emit_split_vector(ctx, dst, dst.size());
+      }
+
+   } else if (ctx->stage == fragment_fs) {
+      nir_instr *off_instr = instr->src[0].ssa->parent_instr;
+      if (off_instr->type != nir_instr_type_load_const ||
+          nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
+         fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
+         nir_print_instr(off_instr, stderr);
+         fprintf(stderr, "\n");
+      }
+
+      Temp prim_mask = ctx->prim_mask;
+      nir_const_value* offset = nir_src_as_const_value(instr->src[0]);
+      if (offset) {
+         assert(offset->u32 == 0);
+      } else {
+         /* the lower 15 bits of the prim_mask contain the offset into LDS
+          * while the upper bits contain the number of prims */
+         Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa);
+         assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
+         Builder bld(ctx->program, ctx->block);
+         Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
+         stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
+         stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
+         offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
+         prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
+      }
+
+      unsigned idx = nir_intrinsic_base(instr);
+      unsigned component = nir_intrinsic_component(instr);
+
+      if (dst.size() == 1) {
+         bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component);
+      } else {
+         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
+         for (unsigned i = 0; i < dst.size(); i++)
+            vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i);
+         vec->definitions[0] = Definition(dst);
+         bld.insert(std::move(vec));
+      }
+
+   } else {
+      unreachable("Shader stage not implemented");
+   }
+}
+
+Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
+{
+   if (ctx->program->info->need_indirect_descriptor_sets) {
+      Builder bld(ctx->program, ctx->block);
+      Temp ptr64 = convert_pointer_to_64_bit(ctx, ctx->descriptor_sets[0]);
+      return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));
+   }
+
+   return ctx->descriptor_sets[desc_set];
+}
+
+
+void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Builder bld(ctx->program, ctx->block);
+   Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
+   unsigned desc_set = nir_intrinsic_desc_set(instr);
+   unsigned binding = nir_intrinsic_binding(instr);
+
+   Temp desc_ptr;
+   radv_pipeline_layout *pipeline_layout = ctx->options->layout;
+   radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
+   unsigned offset = layout->binding[binding].offset;
+   unsigned stride;
+   if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+       layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
+      unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
+      desc_ptr = ctx->push_constants;
+      offset = pipeline_layout->push_constant_size + 16 * idx;
+      stride = 16;
+   } else {
+      desc_ptr = load_desc_ptr(ctx, desc_set);
+      stride = layout->binding[binding].size;
+   }
+
+   nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
+   unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
+   if (stride != 1) {
+      if (nir_const_index) {
+         const_index = const_index * stride;
+      } else {
+         index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
+      }
+   }
+   if (offset) {
+      if (nir_const_index) {
+         const_index = const_index + offset;
+      } else {
+         index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
+      }
+   }
+
+   if (nir_const_index && const_index == 0) {
+      index = desc_ptr;
+   } else {
+      index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
+                       nir_const_index ? Operand(const_index) : Operand(index),
+                       Operand(desc_ptr));
+   }
+
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+   bld.sop1(aco_opcode::s_mov_b32, Definition(dst), index);
+}
+
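+/* Loads the vector for `dst` from `rsrc` at `offset`: MUBUF loads (split into two
+ * when more than 16 bytes are needed) for VGPR destinations and for GLC loads on
+ * pre-GFX8, SMEM loads otherwise. */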
+void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, Temp rsrc, Temp offset, bool glc=false)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   unsigned num_bytes = dst.size() * 4;
+
+   aco_opcode op;
+   if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
+      if (ctx->options->chip_class < GFX8)
+         offset = as_vgpr(ctx, offset);
+
+      Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
+      Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
+      unsigned const_offset = 0;
+
+      Temp lower = Temp();
+      if (num_bytes > 16) {
+         assert(num_components == 3 || num_components == 4);
+         op = aco_opcode::buffer_load_dwordx4;
+         lower = bld.tmp(v4);
+         aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
+         mubuf->definitions[0] = Definition(lower);
+         mubuf->operands[0] = vaddr;
+         mubuf->operands[1] = Operand(rsrc);
+         mubuf->operands[2] = soffset;
+         mubuf->offen = (offset.type() == RegType::vgpr);
+         mubuf->glc = glc;
+         mubuf->barrier = barrier_buffer;
+         bld.insert(std::move(mubuf));
+         emit_split_vector(ctx, lower, 2);
+         num_bytes -= 16;
+         const_offset = 16;
+      }
+
+      switch (num_bytes) {
+         case 4:
+            op = aco_opcode::buffer_load_dword;
+            break;
+         case 8:
+            op = aco_opcode::buffer_load_dwordx2;
+            break;
+         case 12:
+            op = aco_opcode::buffer_load_dwordx3;
+            break;
+         case 16:
+            op = aco_opcode::buffer_load_dwordx4;
+            break;
+         default:
+            unreachable("Load SSBO not implemented for this size.");
+      }
+      aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
+      mubuf->operands[0] = vaddr;
+      mubuf->operands[1] = Operand(rsrc);
+      mubuf->operands[2] = soffset;
+      mubuf->offen = (offset.type() == RegType::vgpr);
+      mubuf->glc = glc;
+      mubuf->barrier = barrier_buffer;
+      mubuf->offset = const_offset;
+      aco_ptr<Instruction> instr = std::move(mubuf);
+
+      if (dst.size() > 4) {
+         assert(lower != Temp());
+         Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
+         instr->definitions[0] = Definition(upper);
+         bld.insert(std::move(instr));
+         if (dst.size() == 8)
+            emit_split_vector(ctx, upper, 2);
+         instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
+         instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
+         instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
+         instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
+         if (dst.size() == 8)
+            instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
+      }
+
+      if (dst.type() == RegType::sgpr) {
+         Temp vec = bld.tmp(RegType::vgpr, dst.size());
+         instr->definitions[0] = Definition(vec);
+         bld.insert(std::move(instr));
+         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
+      } else {
+         instr->definitions[0] = Definition(dst);
+         bld.insert(std::move(instr));
+      }
+   } else {
+      switch (num_bytes) {
+         case 4:
+            op = aco_opcode::s_buffer_load_dword;
+            break;
+         case 8:
+            op = aco_opcode::s_buffer_load_dwordx2;
+            break;
+         case 12:
+         case 16:
+            op = aco_opcode::s_buffer_load_dwordx4;
+            break;
+         case 24:
+         case 32:
+            op = aco_opcode::s_buffer_load_dwordx8;
+            break;
+         default:
+            unreachable("Load SSBO not implemented for this size.");
+      }
+      aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
+      load->operands[0] = Operand(rsrc);
+      load->operands[1] = Operand(bld.as_uniform(offset));
+      assert(load->operands[1].getTemp().type() == RegType::sgpr);
+      load->definitions[0] = Definition(dst);
+      load->glc = glc;
+      load->barrier = barrier_buffer;
+      assert(ctx->options->chip_class >= GFX8 || !glc);
+
+      /* trim vector */
+      if (dst.size() == 3) {
+         Temp vec = bld.tmp(s4);
+         load->definitions[0] = Definition(vec);
+         bld.insert(std::move(load));
+         emit_split_vector(ctx, vec, 4);
+
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
+                    emit_extract_vector(ctx, vec, 0, s1),
+                    emit_extract_vector(ctx, vec, 1, s1),
+                    emit_extract_vector(ctx, vec, 2, s1));
+      } else if (dst.size() == 6) {
+         Temp vec = bld.tmp(s8);
+         load->definitions[0] = Definition(vec);
+         bld.insert(std::move(load));
+         emit_split_vector(ctx, vec, 4);
+
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
+                    emit_extract_vector(ctx, vec, 0, s2),
+                    emit_extract_vector(ctx, vec, 1, s2),
+                    emit_extract_vector(ctx, vec, 2, s2));
+      } else {
+         bld.insert(std::move(load));
+      }
+
+   }
+   emit_split_vector(ctx, dst, num_components);
+}
+
+void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+   Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
+
+   Builder bld(ctx->program, ctx->block);
+
+   nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
+   unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
+   unsigned binding = nir_intrinsic_binding(idx_instr);
+   radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
+
+   if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+      uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                           S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                           S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+                           S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+                           S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                           S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+      Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
+                                     Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
+                                     Operand(0xFFFFFFFFu),
+                                     Operand(desc_type));
+      rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
+                        rsrc, upper_dwords);
+   } else {
+      rsrc = convert_pointer_to_64_bit(ctx, rsrc);
+      rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
+   }
+
+   load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
+}
+
+void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Builder bld(ctx->program, ctx->block);
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+   unsigned offset = nir_intrinsic_base(instr);
+   nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
+   if (index_cv && instr->dest.ssa.bit_size == 32) {
+
+      unsigned count = instr->dest.ssa.num_components;
+      unsigned start = (offset + index_cv->u32) / 4u;
+      start -= ctx->base_inline_push_consts;
+      if (start + count <= ctx->num_inline_push_consts) {
+         std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
+         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
+         for (unsigned i = 0; i < count; ++i) {
+            elems[i] = ctx->inline_push_consts[start + i];
+            vec->operands[i] = Operand{elems[i]};
+         }
+         vec->definitions[0] = Definition(dst);
+         ctx->block->instructions.emplace_back(std::move(vec));
+         ctx->allocated_vec.emplace(dst.id(), elems);
+         return;
+      }
+   }
+
+   Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
+   if (offset != 0) // TODO check if index != 0 as well
+      index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
+   Temp ptr = convert_pointer_to_64_bit(ctx, ctx->push_constants);
+   Temp vec = dst;
+   bool trim = false;
+   aco_opcode op;
+
+   switch (dst.size()) {
+   case 1:
+      op = aco_opcode::s_load_dword;
+      break;
+   case 2:
+      op = aco_opcode::s_load_dwordx2;
+      break;
+   case 3:
+      vec = bld.tmp(s4);
+      trim = true;
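+      /* fallthrough */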
+   case 4:
+      op = aco_opcode::s_load_dwordx4;
+      break;
+   case 6:
+      vec = bld.tmp(s8);
+      trim = true;
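+      /* fallthrough */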
+   case 8:
+      op = aco_opcode::s_load_dwordx8;
+      break;
+   default:
+      unreachable("unimplemented or forbidden load_push_constant.");
+   }
+
+   bld.smem(op, Definition(vec), ptr, index);
+
+   if (trim) {
+      emit_split_vector(ctx, vec, 4);
+      RegClass rc = dst.size() == 3 ? s1 : s2;
+      bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
+                 emit_extract_vector(ctx, vec, 0, rc),
+                 emit_extract_vector(ctx, vec, 1, rc),
+                 emit_extract_vector(ctx, vec, 2, rc));
+
+   }
+   emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
+}
+
+void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+   Builder bld(ctx->program, ctx->block);
+
+   uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                        S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                        S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+                        S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+   if (ctx->options->chip_class >= GFX10) {
+      desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                   S_008F0C_OOB_SELECT(3) |
+                   S_008F0C_RESOURCE_LEVEL(1);
+   } else {
+      desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+   }
+
+   unsigned base = nir_intrinsic_base(instr) + ctx->constant_data_offset;
+   unsigned range = nir_intrinsic_range(instr);
+
+   Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
+   if (base && offset.type() == RegType::sgpr)
+      offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
+   else if (base && offset.type() == RegType::vgpr)
+      offset = bld.vadd32(bld.def(v1), Operand(base), offset);
+
+   Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
+                          bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(0u)),
+                          Operand(MIN2(range, ctx->shader->constant_data_size - nir_intrinsic_base(instr))),
+                          Operand(desc_type));
+
+   load_buffer(ctx, instr->num_components, dst, rsrc, offset);
+}
+
+void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
+      ctx->cf_info.exec_potentially_empty = true;
+
+   ctx->program->needs_exact = true;
+
+   Builder bld(ctx->program, ctx->block);
+   Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
+   src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+   bld.pseudo(aco_opcode::p_discard_if, src);
+   ctx->block->kind |= block_kind_uses_discard_if;
+   return;
+}
+
+void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
+      ctx->cf_info.exec_potentially_empty = true;
+
+   bool divergent = ctx->cf_info.parent_if.is_divergent ||
+                    ctx->cf_info.parent_loop.has_divergent_continue;
+
+   if (ctx->block->loop_nest_depth &&
+       ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
+      /* we handle discards the same way as jump instructions */
+      append_logical_end(ctx->block);
+
+      /* in loops, discard behaves like break */
+      Block *linear_target = ctx->cf_info.parent_loop.exit;
+      ctx->block->kind |= block_kind_discard;
+
+      if (!divergent) {
+         /* uniform discard - loop ends here */
+         assert(nir_instr_is_last(&instr->instr));
+         ctx->block->kind |= block_kind_uniform;
+         ctx->cf_info.has_branch = true;
+         bld.branch(aco_opcode::p_branch);
+         add_linear_edge(ctx->block->index, linear_target);
+         return;
+      }
+
+      /* we add a break right after the discard() instruction */
+      ctx->block->kind |= block_kind_break;
+      unsigned idx = ctx->block->index;
+
+      /* remove critical edges from linear CFG */
+      bld.branch(aco_opcode::p_branch);
+      Block* break_block = ctx->program->create_and_insert_block();
+      break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
+      break_block->kind |= block_kind_uniform;
+      add_linear_edge(idx, break_block);
+      add_linear_edge(break_block->index, linear_target);
+      bld.reset(break_block);
+      bld.branch(aco_opcode::p_branch);
+
+      Block* continue_block = ctx->program->create_and_insert_block();
+      continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
+      add_linear_edge(idx, continue_block);
+      append_logical_start(continue_block);
+      ctx->block = continue_block;
+
+      return;
+   }
+
+   /* it can currently happen that NIR doesn't remove the unreachable code */
+   if (!nir_instr_is_last(&instr->instr)) {
+      ctx->program->needs_exact = true;
+      /* save exec somewhere temporarily so that it doesn't get
+       * overwritten before the discard from outer exec masks */
+      Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2));
+      bld.pseudo(aco_opcode::p_discard_if, cond);
+      ctx->block->kind |= block_kind_uses_discard_if;
+      return;
+   }
+
+   /* This condition is incorrect for uniformly branched discards in a loop
+    * predicated by a divergent condition, but the above code catches that case
+    * and the discard would end up turning into a discard_if.
+    * For example:
+    * if (divergent) {
+    *    while (...) {
+    *       if (uniform) {
+    *          discard;
+    *       }
+    *    }
+    * }
+    */
+   if (!ctx->cf_info.parent_if.is_divergent) {
+      /* program just ends here */
+      ctx->block->kind |= block_kind_uniform;
+      bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
+              0 /* enabled mask */, 9 /* dest */,
+              false /* compressed */, true/* done */, true /* valid mask */);
+      bld.sopp(aco_opcode::s_endpgm);
+      // TODO: this may be followed by a dead-code branch that only exists to sanitize NIR phis
+   } else {
+      ctx->block->kind |= block_kind_discard;
+      /* branch and linear edge is added by visit_if() */
+   }
+}
+
+enum aco_descriptor_type {
+   ACO_DESC_IMAGE,
+   ACO_DESC_FMASK,
+   ACO_DESC_SAMPLER,
+   ACO_DESC_BUFFER,
+   ACO_DESC_PLANE_0,
+   ACO_DESC_PLANE_1,
+   ACO_DESC_PLANE_2,
+};
+
+enum aco_image_dim {
+   aco_image_1d,
+   aco_image_2d,
+   aco_image_3d,
+   aco_image_cube, // includes cube arrays
+   aco_image_1darray,
+   aco_image_2darray,
+   aco_image_2dmsaa,
+   aco_image_2darraymsaa,
+};
+
+static enum aco_image_dim
+get_sampler_dim(isel_context *ctx, enum glsl_sampler_dim dim, bool is_array)
+{
+   switch (dim) {
+   case GLSL_SAMPLER_DIM_1D:
+      if (ctx->options->chip_class >= GFX9)
+         return is_array ? aco_image_2darray : aco_image_2d;
+      return is_array ? aco_image_1darray : aco_image_1d;
+   case GLSL_SAMPLER_DIM_2D:
+   case GLSL_SAMPLER_DIM_RECT:
+   case GLSL_SAMPLER_DIM_EXTERNAL:
+      return is_array ? aco_image_2darray : aco_image_2d;
+   case GLSL_SAMPLER_DIM_3D:
+      return aco_image_3d;
+   case GLSL_SAMPLER_DIM_CUBE:
+      return aco_image_cube;
+   case GLSL_SAMPLER_DIM_MS:
+      return is_array ? aco_image_2darraymsaa : aco_image_2dmsaa;
+   case GLSL_SAMPLER_DIM_SUBPASS:
+      return aco_image_2darray;
+   case GLSL_SAMPLER_DIM_SUBPASS_MS:
+      return aco_image_2darraymsaa;
+   default:
+      unreachable("bad sampler dim");
+   }
+}
+
+static bool
+should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
+   if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
+      return false;
+   aco_image_dim dim = get_sampler_dim(ctx, sampler_dim, is_array);
+   return dim == aco_image_cube ||
+          dim == aco_image_1darray ||
+          dim == aco_image_2darray ||
+          dim == aco_image_2darraymsaa;
+}
+
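+/* Builds the resource descriptor (image/fmask/sampler/buffer/plane) for the given
+ * deref: array derefs are walked to accumulate a constant and/or dynamic index,
+ * then the descriptor is loaded from its descriptor set with an SMEM load
+ * (immutable samplers are materialized as inline constants instead). */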
+Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
+                      enum aco_descriptor_type desc_type,
+                      const nir_tex_instr *tex_instr, bool image, bool write)
+{
+/* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
+   std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
+   if (it != ctx->tex_desc.end())
+      return it->second;
+*/
+   Temp index = Temp();
+   bool index_set = false;
+   unsigned constant_index = 0;
+   unsigned descriptor_set;
+   unsigned base_index;
+   Builder bld(ctx->program, ctx->block);
+
+   if (!deref_instr) {
+      assert(tex_instr && !image);
+      descriptor_set = 0;
+      base_index = tex_instr->sampler_index;
+   } else {
+      while(deref_instr->deref_type != nir_deref_type_var) {
+         unsigned array_size = glsl_get_aoa_size(deref_instr->type);
+         if (!array_size)
+            array_size = 1;
+
+         assert(deref_instr->deref_type == nir_deref_type_array);
+         nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
+         if (const_value) {
+            constant_index += array_size * const_value->u32;
+         } else {
+            Temp indirect = bld.as_uniform(get_ssa_temp(ctx, deref_instr->arr.index.ssa));
+
+            if (array_size != 1)
+               indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
+
+            if (!index_set) {
+               index = indirect;
+               index_set = true;
+            } else {
+               index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
+            }
+         }
+
+         deref_instr = nir_src_as_deref(deref_instr->parent);
+      }
+      descriptor_set = deref_instr->var->data.descriptor_set;
+      base_index = deref_instr->var->data.binding;
+   }
+
+   Temp list = load_desc_ptr(ctx, descriptor_set);
+   list = convert_pointer_to_64_bit(ctx, list);
+
+   struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
+   struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
+   unsigned offset = binding->offset;
+   unsigned stride = binding->size;
+   aco_opcode opcode;
+   RegClass type;
+
+   assert(base_index < layout->binding_count);
+
+   switch (desc_type) {
+   case ACO_DESC_IMAGE:
+      type = s8;
+      opcode = aco_opcode::s_load_dwordx8;
+      break;
+   case ACO_DESC_FMASK:
+      type = s8;
+      opcode = aco_opcode::s_load_dwordx8;
+      offset += 32;
+      break;
+   case ACO_DESC_SAMPLER:
+      type = s4;
+      opcode = aco_opcode::s_load_dwordx4;
+      if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
+         offset += radv_combined_image_descriptor_sampler_offset(binding);
+      break;
+   case ACO_DESC_BUFFER:
+      type = s4;
+      opcode = aco_opcode::s_load_dwordx4;
+      break;
+   case ACO_DESC_PLANE_0:
+   case ACO_DESC_PLANE_1:
+      type = s8;
+      opcode = aco_opcode::s_load_dwordx8;
+      offset += 32 * (desc_type - ACO_DESC_PLANE_0);
+      break;
+   case ACO_DESC_PLANE_2:
+      type = s4;
+      opcode = aco_opcode::s_load_dwordx4;
+      offset += 64;
+      break;
+   default:
+      unreachable("invalid desc_type\n");
+   }
+
+   offset += constant_index * stride;
+
+   if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
+      (!index_set || binding->immutable_samplers_equal)) {
+      if (binding->immutable_samplers_equal)
+         constant_index = 0;
+
+      const uint32_t *samplers = radv_immutable_samplers(layout, binding);
+      return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
+                        Operand(samplers[constant_index * 4 + 0]),
+                        Operand(samplers[constant_index * 4 + 1]),
+                        Operand(samplers[constant_index * 4 + 2]),
+                        Operand(samplers[constant_index * 4 + 3]));
+   }
+
+   Operand off;
+   if (!index_set) {
+      off = Operand(offset);
+   } else {
+      off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
+                                   bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
+   }
+
+   Temp res = bld.smem(opcode, bld.def(type), list, off);
+
+   if (desc_type == ACO_DESC_PLANE_2) {
+      Temp components[8];
+      for (unsigned i = 0; i < 8; i++)
+         components[i] = bld.tmp(s1);
+      bld.pseudo(aco_opcode::p_split_vector,
+                 Definition(components[0]),
+                 Definition(components[1]),
+                 Definition(components[2]),
+                 Definition(components[3]),
+                 res);
+
+      Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
+      bld.pseudo(aco_opcode::p_split_vector,
+                 bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
+                 Definition(components[4]),
+                 Definition(components[5]),
+                 Definition(components[6]),
+                 Definition(components[7]),
+                 desc2);
+
+      res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
+                       components[0], components[1], components[2], components[3],
+                       components[4], components[5], components[6], components[7]);
+   }
+
+   return res;
+}
+
+static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
+{
+   switch (dim) {
+   case GLSL_SAMPLER_DIM_BUF:
+      return 1;
+   case GLSL_SAMPLER_DIM_1D:
+      return array ? 2 : 1;
+   case GLSL_SAMPLER_DIM_2D:
+      return array ? 3 : 2;
+   case GLSL_SAMPLER_DIM_MS:
+      return array ? 4 : 3;
+   case GLSL_SAMPLER_DIM_3D:
+   case GLSL_SAMPLER_DIM_CUBE:
+      return 3;
+   case GLSL_SAMPLER_DIM_RECT:
+   case GLSL_SAMPLER_DIM_SUBPASS:
+      return 2;
+   case GLSL_SAMPLER_DIM_SUBPASS_MS:
+      return 3;
+   default:
+      break;
+   }
+   return 0;
+}
+
+
+/* Adjust the sample index according to FMASK.
+ *
+ * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
+ * which is the identity mapping. Each nibble says which physical sample
+ * should be fetched to get that sample.
+ *
+ * For example, 0x11111100 means there are only 2 samples stored and
+ * the second sample covers 3/4 of the pixel. When reading samples 0
+ * and 1, return physical sample 0 (determined by the first two 0s
+ * in FMASK), otherwise return physical sample 1.
+ *
+ * The sample index should be adjusted as follows:
+ *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
+ */
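+/* The v_bfe_u32 below extracts the 4-bit field at bit sample_index*4 of the loaded
+ * fmask value, i.e. exactly the formula above; the shift-by-0 and shift-by-28
+ * cases are special-cased to a cheaper v_and_b32/v_lshrrev_b32. */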
+static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr)
+{
+   Builder bld(ctx->program, ctx->block);
+   Temp fmask = bld.tmp(v1);
+
+   aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
+   load->operands[0] = Operand(coords);
+   load->operands[1] = Operand(fmask_desc_ptr);
+   load->definitions[0] = Definition(fmask);
+   load->glc = false;
+   load->dmask = 0x1;
+   load->unrm = true;
+   load->da = da;
+   load->can_reorder = true; /* fmask images shouldn't be modified */
+   ctx->block->instructions.emplace_back(std::move(load));
+
+   Operand sample_index4;
+   if (sample_index.isConstant() && sample_index.constantValue() < 16) {
+      sample_index4 = Operand(sample_index.constantValue() << 2);
+   } else if (sample_index.regClass() == s1) {
+      sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
+   } else {
+      assert(sample_index.regClass() == v1);
+      sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
+   }
+
+   Temp final_sample;
+   if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
+      final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
+   else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
+      final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
+   else
+      final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
+
+   /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
+    * resource descriptor is 0 (invalid).
+    */
+   Temp compare = bld.tmp(s2);
+   bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
+                Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
+
+   Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
+
+   /* Replace the MSAA sample index. */
+   return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
+}
+
+static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
+{
+
+   Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
+   enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
+   bool is_array = glsl_sampler_type_is_array(type);
+   ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
+   assert(!add_frag_pos && "Input attachments should be lowered.");
+   bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
+   bool gfx9_1d = ctx->options->chip_class >= GFX9 && dim == GLSL_SAMPLER_DIM_1D;
+   int count = image_type_to_components_count(dim, is_array);
+   std::vector<Operand> coords(count);
+
+   if (is_ms) {
+      Operand sample_index;
+      nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
+      if (sample_cv)
+         sample_index = Operand(sample_cv->u32);
+      else
+         sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1));
+
+      if (instr->intrinsic == nir_intrinsic_image_deref_load) {
+         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 3 : 2, 1)};
+         for (unsigned i = 0; i < vec->operands.size(); i++)
+            vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
+         Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2};
+         vec->definitions[0] = Definition(fmask_load_address);
+         ctx->block->instructions.emplace_back(std::move(vec));
+
+         Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
+         sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr));
+      }
+      count--;
+      coords[count] = sample_index;
+   }
+
+   if (count == 1 && !gfx9_1d)
+      return emit_extract_vector(ctx, src0, 0, v1);
+
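+   /* GFX9+ addresses 1D images as 2D: insert a zero Y coordinate and move the
+    * layer index to the third component */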
+   if (gfx9_1d) {
+      coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1));
+      coords.resize(coords.size() + 1);
+      coords[1] = Operand((uint32_t) 0);
+      if (is_array)
+         coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1));
+   } else {
+      for (int i = 0; i < count; i++)
+         coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
+   }
+
+   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
+   for (unsigned i = 0; i < coords.size(); i++)
+      vec->operands[i] = coords[i];
+   Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
+   vec->definitions[0] = Definition(res);
+   ctx->block->instructions.emplace_back(std::move(vec));
+   return res;
+}
+
+
+void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Builder bld(ctx->program, ctx->block);
+   const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+   const struct glsl_type *type = glsl_without_array(var->type);
+   const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+   if (dim == GLSL_SAMPLER_DIM_BUF) {
+      unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
+      unsigned num_channels = util_last_bit(mask);
+      Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
+      Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
+
+      aco_opcode opcode;
+      switch (num_channels) {
+      case 1:
+         opcode = aco_opcode::buffer_load_format_x;
+         break;
+      case 2:
+         opcode = aco_opcode::buffer_load_format_xy;
+         break;
+      case 3:
+         opcode = aco_opcode::buffer_load_format_xyz;
+         break;
+      case 4:
+         opcode = aco_opcode::buffer_load_format_xyzw;
+         break;
+      default:
+         unreachable(">4 channel buffer image load");
+      }
+      aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
+      load->operands[0] = Operand(vindex);
+      load->operands[1] = Operand(rsrc);
+      load->operands[2] = Operand((uint32_t) 0);
+      Temp tmp;
+      if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
+         tmp = dst;
+      else
+         tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
+      load->definitions[0] = Definition(tmp);
+      load->idxen = true;
+      load->barrier = barrier_image;
+      ctx->block->instructions.emplace_back(std::move(load));
+
+      expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
+      return;
+   }
+
+   Temp coords = get_image_coords(ctx, instr, type);
+   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
+   //aco_image_dim img_dim = get_image_dim(ctx, glsl_get_sampler_dim(type), glsl_sampler_type_is_array(type));
+
+   unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
+   unsigned num_components = util_bitcount(dmask);
+   Temp tmp;
+   if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
+      tmp = dst;
+   else
+      tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
+
+   aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
+   load->operands[0] = Operand(coords);
+   load->operands[1] = Operand(resource);
+   load->definitions[0] = Definition(tmp);
+   load->glc = var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
+   load->dmask = dmask;
+   load->unrm = true;
+   load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
+   load->barrier = barrier_image;
+   ctx->block->instructions.emplace_back(std::move(load));
+
+   expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
+   return;
+}
+
+void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+   const struct glsl_type *type = glsl_without_array(var->type);
+   const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
+   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
+
+   bool glc = ctx->options->chip_class == GFX6 || var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
+
+   if (dim == GLSL_SAMPLER_DIM_BUF) {
+      Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
+      Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
+      aco_opcode opcode;
+      switch (data.size()) {
+      case 1:
+         opcode = aco_opcode::buffer_store_format_x;
+         break;
+      case 2:
+         opcode = aco_opcode::buffer_store_format_xy;
+         break;
+      case 3:
+         opcode = aco_opcode::buffer_store_format_xyz;
+         break;
+      case 4:
+         opcode = aco_opcode::buffer_store_format_xyzw;
+         break;
+      default:
+         unreachable(">4 channel buffer image store");
+      }
+      aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
+      store->operands[0] = Operand(vindex);
+      store->operands[1] = Operand(rsrc);
+      store->operands[2] = Operand((uint32_t) 0);
+      store->operands[3] = Operand(data);
+      store->idxen = true;
+      store->glc = glc;
+      store->disable_wqm = true;
+      store->barrier = barrier_image;
+      ctx->program->needs_exact = true;
+      ctx->block->instructions.emplace_back(std::move(store));
+      return;
+   }
+
+   assert(data.type() == RegType::vgpr);
+   Temp coords = get_image_coords(ctx, instr, type);
+   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
+
+   aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
+   store->operands[0] = Operand(coords);
+   store->operands[1] = Operand(resource);
+   store->operands[2] = Operand(s4);
+   store->operands[3] = Operand(data);
+   store->glc = glc;
+   store->dmask = (1 << data.size()) - 1;
+   store->unrm = true;
+   store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
+   store->disable_wqm = true;
+   store->barrier = barrier_image;
+   ctx->program->needs_exact = true;
+   ctx->block->instructions.emplace_back(std::move(store));
+   return;
+}
+
+void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   /* return the previous value if dest is ever used */
+   bool return_previous = false;
+   nir_foreach_use_safe(use_src, &instr->dest.ssa) {
+      return_previous = true;
+      break;
+   }
+   nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
+      return_previous = true;
+      break;
+   }
+
+   const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+   const struct glsl_type *type = glsl_without_array(var->type);
+   const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
+   Builder bld(ctx->program, ctx->block);
+
+   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
+   assert(data.size() == 1 && "64bit image atomics not yet implemented.");
+
+   if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
+      data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
+
+   aco_opcode buf_op, image_op;
+   switch (instr->intrinsic) {
+      case nir_intrinsic_image_deref_atomic_add:
+         buf_op = aco_opcode::buffer_atomic_add;
+         image_op = aco_opcode::image_atomic_add;
+         break;
+      case nir_intrinsic_image_deref_atomic_umin:
+         buf_op = aco_opcode::buffer_atomic_umin;
+         image_op = aco_opcode::image_atomic_umin;
+         break;
+      case nir_intrinsic_image_deref_atomic_imin:
+         buf_op = aco_opcode::buffer_atomic_smin;
+         image_op = aco_opcode::image_atomic_smin;
+         break;
+      case nir_intrinsic_image_deref_atomic_umax:
+         buf_op = aco_opcode::buffer_atomic_umax;
+         image_op = aco_opcode::image_atomic_umax;
+         break;
+      case nir_intrinsic_image_deref_atomic_imax:
+         buf_op = aco_opcode::buffer_atomic_smax;
+         image_op = aco_opcode::image_atomic_smax;
+         break;
+      case nir_intrinsic_image_deref_atomic_and:
+         buf_op = aco_opcode::buffer_atomic_and;
+         image_op = aco_opcode::image_atomic_and;
+         break;
+      case nir_intrinsic_image_deref_atomic_or:
+         buf_op = aco_opcode::buffer_atomic_or;
+         image_op = aco_opcode::image_atomic_or;
+         break;
+      case nir_intrinsic_image_deref_atomic_xor:
+         buf_op = aco_opcode::buffer_atomic_xor;
+         image_op = aco_opcode::image_atomic_xor;
+         break;
+      case nir_intrinsic_image_deref_atomic_exchange:
+         buf_op = aco_opcode::buffer_atomic_swap;
+         image_op = aco_opcode::image_atomic_swap;
+         break;
+      case nir_intrinsic_image_deref_atomic_comp_swap:
+         buf_op = aco_opcode::buffer_atomic_cmpswap;
+         image_op = aco_opcode::image_atomic_cmpswap;
+         break;
+      default:
+         unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
+   }
+
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+   if (dim == GLSL_SAMPLER_DIM_BUF) {
+      Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
+      Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
+      //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
+      aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
+      mubuf->operands[0] = Operand(vindex);
+      mubuf->operands[1] = Operand(resource);
+      mubuf->operands[2] = Operand((uint32_t)0);
+      mubuf->operands[3] = Operand(data);
+      if (return_previous)
+         mubuf->definitions[0] = Definition(dst);
+      mubuf->offset = 0;
+      mubuf->idxen = true;
+      mubuf->glc = return_previous;
+      mubuf->disable_wqm = true;
+      mubuf->barrier = barrier_image;
+      ctx->program->needs_exact = true;
+      ctx->block->instructions.emplace_back(std::move(mubuf));
+      return;
+   }
+
+   Temp coords = get_image_coords(ctx, instr, type);
+   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
+   aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)};
+   mimg->operands[0] = Operand(coords);
+   mimg->operands[1] = Operand(resource);
+   mimg->operands[2] = Operand(s4); /* no sampler */
+   mimg->operands[3] = Operand(data);
+   if (return_previous)
+      mimg->definitions[0] = Definition(dst);
+   mimg->glc = return_previous;
+   mimg->dmask = (1 << data.size()) - 1;
+   mimg->unrm = true;
+   mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
+   mimg->disable_wqm = true;
+   mimg->barrier = barrier_image;
+   ctx->program->needs_exact = true;
+   ctx->block->instructions.emplace_back(std::move(mimg));
+   return;
+}
+
+void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
+{
+   if (in_elements && ctx->options->chip_class == GFX8) {
+      Builder bld(ctx->program, ctx->block);
+
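+      /* The size in elements is wanted, but on GFX8 the descriptor stores the size
+       * in bytes: extract the stride from dword1 and compute size / stride with a
+       * float reciprocal (see the TODO below for a pure-SALU idea). */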
+      Temp stride = emit_extract_vector(ctx, desc, 1, s1);
+      stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
+      stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
+      stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
+
+      Temp size = emit_extract_vector(ctx, desc, 2, s1);
+      size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
+
+      Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
+      res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
+      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
+
+      // TODO: we can probably calculate this faster on the scalar unit to do: size / stride{1,2,4,8,12,16}
+      /* idea
+       * for 1,2,4,8,16, the result is just (stride >> S_FF1_I32_B32)
+       * in case 12 (or 3?), we have to divide by 3:
+       * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
+       * use v_mul_hi_u32 with magic number to divide
+       * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
+       * disable v_skip
+       * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
+       */
+
+   } else {
+      emit_extract_vector(ctx, desc, 2, dst);
+   }
+}
+
+void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+   const struct glsl_type *type = glsl_without_array(var->type);
+   Builder bld(ctx->program, ctx->block);
+
+   if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
+      Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
+      return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
+   }
+
+   /* LOD */
+   Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
+
+   /* Resource */
+   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
+
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+   aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
+   mimg->operands[0] = Operand(lod);
+   mimg->operands[1] = Operand(resource);
+   unsigned& dmask = mimg->dmask;
+   mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
+   mimg->da = glsl_sampler_type_is_array(type);
+   mimg->can_reorder = true;
+   Definition& def = mimg->definitions[0];
+   ctx->block->instructions.emplace_back(std::move(mimg));
+
+   if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
+       glsl_sampler_type_is_array(type)) {
+
+      assert(instr->dest.ssa.num_components == 3);
+      Temp tmp = {ctx->program->allocateId(), v3};
+      def = Definition(tmp);
+      emit_split_vector(ctx, tmp, 3);
+
+      /* divide 3rd value by 6 by multiplying with magic number */
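+      /* 0x2AAAAAAB ~= 2^32 / 6, so the high 32 bits of the product are layers / 6
+       * (exact, because the layer count of a cube array is a multiple of 6) */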
+      Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
+      Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
+
+      bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
+                 emit_extract_vector(ctx, tmp, 0, v1),
+                 emit_extract_vector(ctx, tmp, 1, v1),
+                 by_6);
+
+   } else if (ctx->options->chip_class >= GFX9 &&
+              glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
+              glsl_sampler_type_is_array(type)) {
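+      /* GFX9 stores 1D arrays as 2D, so resinfo also returns a dummy height:
+       * select only width and layers with dmask = 0x5 */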
+      assert(instr->dest.ssa.num_components == 2);
+      def = Definition(dst);
+      dmask = 0x5;
+   } else {
+      def = Definition(dst);
+   }
+
+   emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
+}
+
+void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Builder bld(ctx->program, ctx->block);
+   unsigned num_components = instr->num_components;
+
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+   Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+   rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
+
+   bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
+   load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc);
+}
+
+void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Builder bld(ctx->program, ctx->block);
+   Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
+   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+   unsigned writemask = nir_intrinsic_write_mask(instr);
+
+   Temp offset;
+   if (ctx->options->chip_class < GFX8)
+      offset = as_vgpr(ctx,get_ssa_temp(ctx, instr->src[2].ssa));
+   else
+      offset = get_ssa_temp(ctx, instr->src[2].ssa);
+
+   Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
+   rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
+
+   bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
+               ctx->options->chip_class >= GFX8;
+   if (smem)
+      offset = bld.as_uniform(offset);
+   bool smem_nonfs = smem && ctx->stage != fragment_fs;
+
+   while (writemask) {
+      int start, count;
+      u_bit_scan_consecutive_range(&writemask, &start, &count);
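+      /* there is no s_buffer_store_dwordx3: store two dwords now and put the
+       * third one back into the writemask */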
+      if (count == 3 && smem) {
+         writemask |= 1u << (start + 2);
+         count = 2;
+      }
+      int num_bytes = count * elem_size_bytes;
+
+      if (num_bytes > 16) {
+         assert(elem_size_bytes == 8);
+         writemask |= (((count - 2) << 1) - 1) << (start + 2);
+         count = 2;
+         num_bytes = 16;
+      }
+
+      // TODO: check alignment of sub-dword stores
+      // TODO: split 3 bytes. there is no store instruction for that
+
+      Temp write_data;
+      if (count != instr->num_components) {
+         emit_split_vector(ctx, data, instr->num_components);
+         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
+         for (int i = 0; i < count; i++) {
+            Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
+            vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
+         }
+         write_data = bld.tmp(smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
+         vec->definitions[0] = Definition(write_data);
+         ctx->block->instructions.emplace_back(std::move(vec));
+      } else if (!smem && data.type() != RegType::vgpr) {
+         assert(num_bytes % 4 == 0);
+         write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
+      } else if (smem_nonfs && data.type() == RegType::vgpr) {
+         assert(num_bytes % 4 == 0);
+         write_data = bld.as_uniform(data);
+      } else {
+         write_data = data;
+      }
+
+      aco_opcode vmem_op, smem_op;
+      switch (num_bytes) {
+         case 4:
+            vmem_op = aco_opcode::buffer_store_dword;
+            smem_op = aco_opcode::s_buffer_store_dword;
+            break;
+         case 8:
+            vmem_op = aco_opcode::buffer_store_dwordx2;
+            smem_op = aco_opcode::s_buffer_store_dwordx2;
+            break;
+         case 12:
+            vmem_op = aco_opcode::buffer_store_dwordx3;
+            smem_op = aco_opcode::last_opcode;
+            assert(!smem);
+            break;
+         case 16:
+            vmem_op = aco_opcode::buffer_store_dwordx4;
+            smem_op = aco_opcode::s_buffer_store_dwordx4;
+            break;
+         default:
+            unreachable("Store SSBO not implemented for this size.");
+      }
+      if (ctx->stage == fragment_fs)
+         smem_op = aco_opcode::p_fs_buffer_store_smem;
+
+      if (smem) {
+         aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
+         store->operands[0] = Operand(rsrc);
+         if (start) {
+            Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
+                                offset, Operand(start * elem_size_bytes));
+            store->operands[1] = Operand(off);
+         } else {
+            store->operands[1] = Operand(offset);
+         }
+         if (smem_op != aco_opcode::p_fs_buffer_store_smem)
+            store->operands[1].setFixed(m0);
+         store->operands[2] = Operand(write_data);
+         store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+         store->disable_wqm = true;
+         store->barrier = barrier_buffer;
+         ctx->block->instructions.emplace_back(std::move(store));
+         ctx->program->wb_smem_l1_on_end = true;
+         if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
+            ctx->block->kind |= block_kind_needs_lowering;
+            ctx->program->needs_exact = true;
+         }
+      } else {
+         aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
+         store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
+         store->operands[1] = Operand(rsrc);
+         store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
+         store->operands[3] = Operand(write_data);
+         store->offset = start * elem_size_bytes;
+         store->offen = (offset.type() == RegType::vgpr);
+         store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+         store->disable_wqm = true;
+         store->barrier = barrier_buffer;
+         ctx->program->needs_exact = true;
+         ctx->block->instructions.emplace_back(std::move(store));
+      }
+   }
+}
+
+void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   /* return the previous value if dest is ever used */
+   bool return_previous = false;
+   nir_foreach_use_safe(use_src, &instr->dest.ssa) {
+      return_previous = true;
+      break;
+   }
+   nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
+      return_previous = true;
+      break;
+   }
+
+   Builder bld(ctx->program, ctx->block);
+   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
+
+   if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
+      data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
+                        get_ssa_temp(ctx, instr->src[3].ssa), data);
+
+   Temp offset;
+   if (ctx->options->chip_class < GFX8)
+      offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
+   else
+      offset = get_ssa_temp(ctx, instr->src[1].ssa);
+
+   Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+   rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
+
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+   aco_opcode op32, op64;
+   switch (instr->intrinsic) {
+      case nir_intrinsic_ssbo_atomic_add:
+         op32 = aco_opcode::buffer_atomic_add;
+         op64 = aco_opcode::buffer_atomic_add_x2;
+         break;
+      case nir_intrinsic_ssbo_atomic_imin:
+         op32 = aco_opcode::buffer_atomic_smin;
+         op64 = aco_opcode::buffer_atomic_smin_x2;
+         break;
+      case nir_intrinsic_ssbo_atomic_umin:
+         op32 = aco_opcode::buffer_atomic_umin;
+         op64 = aco_opcode::buffer_atomic_umin_x2;
+         break;
+      case nir_intrinsic_ssbo_atomic_imax:
+         op32 = aco_opcode::buffer_atomic_smax;
+         op64 = aco_opcode::buffer_atomic_smax_x2;
+         break;
+      case nir_intrinsic_ssbo_atomic_umax:
+         op32 = aco_opcode::buffer_atomic_umax;
+         op64 = aco_opcode::buffer_atomic_umax_x2;
+         break;
+      case nir_intrinsic_ssbo_atomic_and:
+         op32 = aco_opcode::buffer_atomic_and;
+         op64 = aco_opcode::buffer_atomic_and_x2;
+         break;
+      case nir_intrinsic_ssbo_atomic_or:
+         op32 = aco_opcode::buffer_atomic_or;
+         op64 = aco_opcode::buffer_atomic_or_x2;
+         break;
+      case nir_intrinsic_ssbo_atomic_xor:
+         op32 = aco_opcode::buffer_atomic_xor;
+         op64 = aco_opcode::buffer_atomic_xor_x2;
+         break;
+      case nir_intrinsic_ssbo_atomic_exchange:
+         op32 = aco_opcode::buffer_atomic_swap;
+         op64 = aco_opcode::buffer_atomic_swap_x2;
+         break;
+      case nir_intrinsic_ssbo_atomic_comp_swap:
+         op32 = aco_opcode::buffer_atomic_cmpswap;
+         op64 = aco_opcode::buffer_atomic_cmpswap_x2;
+         break;
+      default:
+         unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
+   }
+   aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
+   aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
+   mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
+   mubuf->operands[1] = Operand(rsrc);
+   mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
+   mubuf->operands[3] = Operand(data);
+   if (return_previous)
+      mubuf->definitions[0] = Definition(dst);
+   mubuf->offset = 0;
+   mubuf->offen = (offset.type() == RegType::vgpr);
+   mubuf->glc = return_previous;
+   mubuf->disable_wqm = true;
+   mubuf->barrier = barrier_buffer;
+   ctx->program->needs_exact = true;
+   ctx->block->instructions.emplace_back(std::move(mubuf));
+}
+
+void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
+
+   Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+   Builder bld(ctx->program, ctx->block);
+   Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
+   get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
+}
+
+void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Builder bld(ctx->program, ctx->block);
+   unsigned num_components = instr->num_components;
+   unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
+
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+   Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
+
+   bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
+   aco_opcode op;
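+   /* Divergent (VGPR) results, and coherent loads on chips where SMEM has no glc
+    * bit (pre-GFX8), go through FLAT/GLOBAL; otherwise a scalar load is used. */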
+   if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
+      bool global = ctx->options->chip_class >= GFX9;
+      aco_opcode op;
+      switch (num_bytes) {
+      case 4:
+         op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
+         break;
+      case 8:
+         op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
+         break;
+      case 12:
+         op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
+         break;
+      case 16:
+         op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
+         break;
+      default:
+         unreachable("load_global not implemented for this size.");
+      }
+      aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
+      flat->operands[0] = Operand(addr);
+      flat->operands[1] = Operand(s1);
+      flat->glc = glc;
+
+      if (dst.type() == RegType::sgpr) {
+         Temp vec = bld.tmp(RegType::vgpr, dst.size());
+         flat->definitions[0] = Definition(vec);
+         ctx->block->instructions.emplace_back(std::move(flat));
+         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
+      } else {
+         flat->definitions[0] = Definition(dst);
+         ctx->block->instructions.emplace_back(std::move(flat));
+      }
+      emit_split_vector(ctx, dst, num_components);
+   } else {
+      switch (num_bytes) {
+         case 4:
+            op = aco_opcode::s_load_dword;
+            break;
+         case 8:
+            op = aco_opcode::s_load_dwordx2;
+            break;
+         case 12:
+         case 16:
+            op = aco_opcode::s_load_dwordx4;
+            break;
+         default:
+            unreachable("load_global not implemented for this size.");
+      }
+      aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
+      load->operands[0] = Operand(addr);
+      load->operands[1] = Operand(0u);
+      load->definitions[0] = Definition(dst);
+      load->glc = glc;
+      load->barrier = barrier_buffer;
+      assert(ctx->options->chip_class >= GFX8 || !glc);
+
+      if (dst.size() == 3) {
+         /* trim vector */
+         Temp vec = bld.tmp(s4);
+         load->definitions[0] = Definition(vec);
+         ctx->block->instructions.emplace_back(std::move(load));
+         emit_split_vector(ctx, vec, 4);
+
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
+                    emit_extract_vector(ctx, vec, 0, s1),
+                    emit_extract_vector(ctx, vec, 1, s1),
+                    emit_extract_vector(ctx, vec, 2, s1));
+      } else {
+         ctx->block->instructions.emplace_back(std::move(load));
+      }
+   }
+}
+
+void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Builder bld(ctx->program, ctx->block);
+   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+
+   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+   Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
+
+   unsigned writemask = nir_intrinsic_write_mask(instr);
+   while (writemask) {
+      int start, count;
+      u_bit_scan_consecutive_range(&writemask, &start, &count);
+      unsigned num_bytes = count * elem_size_bytes;
+
+      Temp write_data = data;
+      if (count != instr->num_components) {
+         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
+         for (int i = 0; i < count; i++)
+            vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
+         write_data = bld.tmp(RegType::vgpr, count);
+         vec->definitions[0] = Definition(write_data);
+         ctx->block->instructions.emplace_back(std::move(vec));
+      }
+
+      unsigned offset = start * elem_size_bytes;
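+      /* FLAT has no immediate offset field before GFX9, so fold the constant
+       * offset into the address with a 64-bit add */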
+      if (offset > 0 && ctx->options->chip_class < GFX9) {
+         Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
+         Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
+         Temp carry = bld.tmp(s2);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
+
+         bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
+                  Operand(offset), addr0);
+         bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2),
+                  Operand(0u), addr1,
+                  carry).def(1).setHint(vcc);
+
+         addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
+
+         offset = 0;
+      }
+
+      bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+      bool global = ctx->options->chip_class >= GFX9;
+      aco_opcode op;
+      switch (num_bytes) {
+      case 4:
+         op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
+         break;
+      case 8:
+         op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
+         break;
+      case 12:
+         op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
+         break;
+      case 16:
+         op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
+         break;
+      default:
+         unreachable("store_global not implemented for this size.");
+      }
+      aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
+      flat->operands[0] = Operand(addr);
+      flat->operands[1] = Operand(s1);
+      flat->operands[2] = Operand(data);
+      flat->glc = glc;
+      flat->offset = offset;
+      ctx->block->instructions.emplace_back(std::move(flat));
+   }
+}
+
+void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
+   Builder bld(ctx->program, ctx->block);
+   switch(instr->intrinsic) {
+      case nir_intrinsic_group_memory_barrier:
+      case nir_intrinsic_memory_barrier:
+         bld.barrier(aco_opcode::p_memory_barrier_all);
+         break;
+      case nir_intrinsic_memory_barrier_atomic_counter:
+         bld.barrier(aco_opcode::p_memory_barrier_atomic);
+         break;
+      case nir_intrinsic_memory_barrier_buffer:
+         bld.barrier(aco_opcode::p_memory_barrier_buffer);
+         break;
+      case nir_intrinsic_memory_barrier_image:
+         bld.barrier(aco_opcode::p_memory_barrier_image);
+         break;
+      case nir_intrinsic_memory_barrier_shared:
+         bld.barrier(aco_opcode::p_memory_barrier_shared);
+         break;
+      default:
+         unreachable("Unimplemented memory barrier intrinsic");
+         break;
+   }
+}
+
+Operand load_lds_size_m0(isel_context *ctx)
+{
+   /* TODO: m0 does not need to be initialized on GFX9+ */
+   Builder bld(ctx->program, ctx->block);
+   return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
+}
+
+
+void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
+   Operand m = load_lds_size_m0(ctx);
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+   assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
+   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+   Builder bld(ctx->program, ctx->block);
+
+   unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
+   unsigned bytes_read = 0;
+   unsigned result_size = 0;
+   unsigned total_bytes = instr->num_components * elem_size_bytes;
+   unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : instr->dest.ssa.bit_size / 8;
+   std::array<Temp, 4> result;
+
+   while (bytes_read < total_bytes) {
+      unsigned todo = total_bytes - bytes_read;
+      bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
+      bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
+
+      aco_opcode op = aco_opcode::last_opcode;
+      if (todo >= 16 && aligned16) {
+         op = aco_opcode::ds_read_b128;
+         todo = 16;
+      } else if (todo >= 12 && aligned16) {
+         op = aco_opcode::ds_read_b96;
+         todo = 12;
+      } else if (todo >= 8) {
+         op = aligned8 ? aco_opcode::ds_read_b64 : aco_opcode::ds_read2_b32;
+         todo = 8;
+      } else if (todo >= 4) {
+         op = aco_opcode::ds_read_b32;
+         todo = 4;
+      } else {
+         assert(false);
+      }
+      assert(todo % elem_size_bytes == 0);
+      unsigned num_elements = todo / elem_size_bytes;
+      unsigned offset = nir_intrinsic_base(instr) + bytes_read;
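+      /* ds_read2_b32 takes two 8-bit dword offsets (offset and offset+1), which
+       * limits the byte offset to 1019; other DS reads take a 16-bit byte offset */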
+      unsigned max_offset = op == aco_opcode::ds_read2_b32 ? 1019 : 65535;
+
+      Temp address_offset = address;
+      if (offset > max_offset) {
+         address_offset = bld.vadd32(bld.def(v1), Operand((uint32_t)nir_intrinsic_base(instr)), address_offset);
+         offset = bytes_read;
+      }
+      assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
+
+      Temp res;
+      if (instr->num_components == 1 && dst.type() == RegType::vgpr)
+         res = dst;
+      else
+         res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
+
+      if (op == aco_opcode::ds_read2_b32)
+         res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1);
+      else
+         res = bld.ds(op, Definition(res), address_offset, m, offset);
+
+      if (instr->num_components == 1) {
+         assert(todo == total_bytes);
+         if (dst.type() == RegType::sgpr)
+            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
+         return;
+      }
+
+      if (dst.type() == RegType::sgpr)
+         res = bld.as_uniform(res);
+
+      if (num_elements == 1) {
+         result[result_size++] = res;
+      } else {
+         assert(res != dst && res.size() % num_elements == 0);
+         aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
+         split->operands[0] = Operand(res);
+         for (unsigned i = 0; i < num_elements; i++)
+            split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
+         ctx->block->instructions.emplace_back(std::move(split));
+      }
+
+      bytes_read += todo;
+   }
+
+   assert(result_size == instr->num_components && result_size > 1);
+   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
+   for (unsigned i = 0; i < result_size; i++)
+      vec->operands[i] = Operand(result[i]);
+   vec->definitions[0] = Definition(dst);
+   ctx->block->instructions.emplace_back(std::move(vec));
+   ctx->allocated_vec.emplace(dst.id(), result);
+}
+
+void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned offset0, unsigned offset1, unsigned align)
+{
+   Builder bld(ctx->program, ctx->block);
+   unsigned bytes_written = 0;
+   while (bytes_written < data.size() * 4) {
+      unsigned todo = data.size() * 4 - bytes_written;
+      bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
+      bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
+
+      aco_opcode op = aco_opcode::last_opcode;
+      unsigned size = 0;
+      if (todo >= 16 && aligned16) {
+         op = aco_opcode::ds_write_b128;
+         size = 4;
+      } else if (todo >= 12 && aligned16) {
+         op = aco_opcode::ds_write_b96;
+         size = 3;
+      } else if (todo >= 8) {
+         op = aligned8 ? aco_opcode::ds_write_b64 : aco_opcode::ds_write2_b32;
+         size = 2;
+      } else if (todo >= 4) {
+         op = aco_opcode::ds_write_b32;
+         size = 1;
+      } else {
+         assert(false);
+      }
+
+      bool write2 = op == aco_opcode::ds_write2_b32;
+      unsigned offset = offset0 + offset1 + bytes_written;
+      unsigned max_offset = write2 ? 1020 : 65535;
+      Temp address_offset = address;
+      if (offset > max_offset) {
+         address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
+         offset = offset1 + bytes_written;
+      }
+      assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
+
+      if (write2) {
+         Temp val0 = emit_extract_vector(ctx, data, bytes_written >> 2, v1);
+         Temp val1 = emit_extract_vector(ctx, data, (bytes_written >> 2) + 1, v1);
+         bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1);
+      } else {
+         Temp val = emit_extract_vector(ctx, data, bytes_written >> 2, RegClass(RegType::vgpr, size));
+         bld.ds(op, address_offset, val, m, offset);
+      }
+
+      bytes_written += size * 4;
+   }
+}
+
+void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   unsigned offset = nir_intrinsic_base(instr);
+   unsigned writemask = nir_intrinsic_write_mask(instr);
+   Operand m = load_lds_size_m0(ctx);
+   Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
+   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
+   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+   assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
+
+   /* we need at most two stores for 32bit variables */
+   int start[2], count[2];
+   u_bit_scan_consecutive_range(&writemask, &start[0], &count[0]);
+   u_bit_scan_consecutive_range(&writemask, &start[1], &count[1]);
+   assert(writemask == 0);
+
+   /* one combined store is sufficient */
+   if (count[0] == count[1]) {
+      Builder bld(ctx->program, ctx->block);
+
+      Temp address_offset = address;
+      if ((offset >> 2) + start[1] > 255) {
+         address_offset = bld.vadd32(bld.def(v1), Operand(offset), address_offset);
+         offset = 0;
+      }
+
+      assert(count[0] == 1);
+      Temp val0 = emit_extract_vector(ctx, data, start[0], v1);
+      Temp val1 = emit_extract_vector(ctx, data, start[1], v1);
+      aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
+      offset = offset / elem_size_bytes;
+      bld.ds(op, address_offset, val0, val1, m,
+             offset + start[0], offset + start[1]);
+      return;
+   }
+
+   unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
+   for (unsigned i = 0; i < 2; i++) {
+      if (count[i] == 0)
+         continue;
+
+      Temp write_data = emit_extract_vector(ctx, data, start[i], RegClass(RegType::vgpr, count[i] * elem_size_bytes / 4));
+      ds_write_helper(ctx, m, address, write_data, offset, start[i] * elem_size_bytes, align);
+   }
+   return;
+}
+
+void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   unsigned offset = nir_intrinsic_base(instr);
+   Operand m = load_lds_size_m0(ctx);
+   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
+   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+
+   unsigned num_operands = 3;
+   aco_opcode op32, op64, op32_rtn, op64_rtn;
+   switch(instr->intrinsic) {
+      case nir_intrinsic_shared_atomic_add:
+         op32 = aco_opcode::ds_add_u32;
+         op64 = aco_opcode::ds_add_u64;
+         op32_rtn = aco_opcode::ds_add_rtn_u32;
+         op64_rtn = aco_opcode::ds_add_rtn_u64;
+         break;
+      case nir_intrinsic_shared_atomic_imin:
+         op32 = aco_opcode::ds_min_i32;
+         op64 = aco_opcode::ds_min_i64;
+         op32_rtn = aco_opcode::ds_min_rtn_i32;
+         op64_rtn = aco_opcode::ds_min_rtn_i64;
+         break;
+      case nir_intrinsic_shared_atomic_umin:
+         op32 = aco_opcode::ds_min_u32;
+         op64 = aco_opcode::ds_min_u64;
+         op32_rtn = aco_opcode::ds_min_rtn_u32;
+         op64_rtn = aco_opcode::ds_min_rtn_u64;
+         break;
+      case nir_intrinsic_shared_atomic_imax:
+         op32 = aco_opcode::ds_max_i32;
+         op64 = aco_opcode::ds_max_i64;
+         op32_rtn = aco_opcode::ds_max_rtn_i32;
+         op64_rtn = aco_opcode::ds_max_rtn_i64;
+         break;
+      case nir_intrinsic_shared_atomic_umax:
+         op32 = aco_opcode::ds_max_u32;
+         op64 = aco_opcode::ds_max_u64;
+         op32_rtn = aco_opcode::ds_max_rtn_u32;
+         op64_rtn = aco_opcode::ds_max_rtn_u64;
+         break;
+      case nir_intrinsic_shared_atomic_and:
+         op32 = aco_opcode::ds_and_b32;
+         op64 = aco_opcode::ds_and_b64;
+         op32_rtn = aco_opcode::ds_and_rtn_b32;
+         op64_rtn = aco_opcode::ds_and_rtn_b64;
+         break;
+      case nir_intrinsic_shared_atomic_or:
+         op32 = aco_opcode::ds_or_b32;
+         op64 = aco_opcode::ds_or_b64;
+         op32_rtn = aco_opcode::ds_or_rtn_b32;
+         op64_rtn = aco_opcode::ds_or_rtn_b64;
+         break;
+      case nir_intrinsic_shared_atomic_xor:
+         op32 = aco_opcode::ds_xor_b32;
+         op64 = aco_opcode::ds_xor_b64;
+         op32_rtn = aco_opcode::ds_xor_rtn_b32;
+         op64_rtn = aco_opcode::ds_xor_rtn_b64;
+         break;
+      case nir_intrinsic_shared_atomic_exchange:
+         op32 = aco_opcode::ds_write_b32;
+         op64 = aco_opcode::ds_write_b64;
+         op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
+         op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
+         break;
+      case nir_intrinsic_shared_atomic_comp_swap:
+         op32 = aco_opcode::ds_cmpst_b32;
+         op64 = aco_opcode::ds_cmpst_b64;
+         op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
+         op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
+         num_operands = 4;
+         break;
+      default:
+         unreachable("Unhandled shared atomic intrinsic");
+   }
+
+   /* return the previous value if dest is ever used */
+   bool return_previous = false;
+   nir_foreach_use_safe(use_src, &instr->dest.ssa) {
+      return_previous = true;
+      break;
+   }
+   nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
+      return_previous = true;
+      break;
+   }
+
+   aco_opcode op;
+   if (data.size() == 1) {
+      assert(instr->dest.ssa.bit_size == 32);
+      op = return_previous ? op32_rtn : op32;
+   } else {
+      assert(instr->dest.ssa.bit_size == 64);
+      op = return_previous ? op64_rtn : op64;
+   }
+
+   if (offset > 65535) {
+      Builder bld(ctx->program, ctx->block);
+      address = bld.vadd32(bld.def(v1), Operand(offset), address);
+      offset = 0;
+   }
+
+   aco_ptr<DS_instruction> ds;
+   ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
+   ds->operands[0] = Operand(address);
+   ds->operands[1] = Operand(data);
+   if (num_operands == 4)
+      ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
+   ds->operands[num_operands - 1] = m;
+   ds->offset0 = offset;
+   if (return_previous)
+      ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
+   ctx->block->instructions.emplace_back(std::move(ds));
+}
+
+void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
+   assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
+   Builder bld(ctx->program, ctx->block);
+   Temp scratch_addr = ctx->private_segment_buffer;
+   if (ctx->stage != MESA_SHADER_COMPUTE)
+      scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), ctx->private_segment_buffer, Operand(0u));
+   uint32_t rsrc_conf;
+   /* older generations need element size = 16 bytes */
+   if (ctx->program->chip_class >= GFX9)
+      rsrc_conf = 0x00E00000u;
+   else
+      rsrc_conf = 0x00F80000u;
+   /* scratch buffer resource: base = scratch addr, num_records = -1, index_stride = 64, add_tid_enable = true */
+   Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
+   Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+   aco_opcode op;
+   switch (dst.size()) {
+      case 1:
+         op = aco_opcode::buffer_load_dword;
+         break;
+      case 2:
+         op = aco_opcode::buffer_load_dwordx2;
+         break;
+      case 3:
+         op = aco_opcode::buffer_load_dwordx3;
+         break;
+      case 4:
+         op = aco_opcode::buffer_load_dwordx4;
+         break;
+      case 6:
+      case 8: {
+         std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
+         Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
+                                bld.def(v4), offset, rsrc,
+                                ctx->scratch_offset, 0, true);
+         Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
+                                                  aco_opcode::buffer_load_dwordx4,
+                                dst.size() == 6 ? bld.def(v2) : bld.def(v4),
+                                offset, rsrc, ctx->scratch_offset, 16, true);
+         emit_split_vector(ctx, lower, 2);
+         elems[0] = emit_extract_vector(ctx, lower, 0, v2);
+         elems[1] = emit_extract_vector(ctx, lower, 1, v2);
+         if (dst.size() == 8) {
+            emit_split_vector(ctx, upper, 2);
+            elems[2] = emit_extract_vector(ctx, upper, 0, v2);
+            elems[3] = emit_extract_vector(ctx, upper, 1, v2);
+         } else {
+            elems[2] = upper;
+         }
+
+         aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
+                                                                         Format::PSEUDO, dst.size() / 2, 1)};
+         for (unsigned i = 0; i < dst.size() / 2; i++)
+            vec->operands[i] = Operand(elems[i]);
+         vec->definitions[0] = Definition(dst);
+         bld.insert(std::move(vec));
+         ctx->allocated_vec.emplace(dst.id(), elems);
+         return;
+      }
+      default:
+         unreachable("Wrong dst size for nir_intrinsic_load_scratch");
+   }
+
+   bld.mubuf(op, Definition(dst), offset, rsrc, ctx->scratch_offset, 0, true);
+   emit_split_vector(ctx, dst, instr->num_components);
+}
+
+void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
+   assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
+   Builder bld(ctx->program, ctx->block);
+   Temp scratch_addr = ctx->private_segment_buffer;
+   if (ctx->stage != MESA_SHADER_COMPUTE)
+      scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), ctx->private_segment_buffer, Operand(0u));
+   uint32_t rsrc_conf;
+   /* older generations need element size = 16 bytes */
+   if (ctx->program->chip_class >= GFX9)
+      rsrc_conf = 0x00E00000u;
+   else
+      rsrc_conf = 0x00F80000u;
+   /* scratch buffer resource: base = scratch addr, num_records = -1, index_stride = 64, add_tid_enable = true */
+   Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
+   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+   Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
+
+   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+   unsigned writemask = nir_intrinsic_write_mask(instr);
+
+   while (writemask) {
+      int start, count;
+      u_bit_scan_consecutive_range(&writemask, &start, &count);
+      int num_bytes = count * elem_size_bytes;
+
+      if (num_bytes > 16) {
+         assert(elem_size_bytes == 8);
+         writemask |= (((count - 2) << 1) - 1) << (start + 2);
+         count = 2;
+         num_bytes = 16;
+      }
+
+      // TODO: check alignment of sub-dword stores
+      // TODO: split 3 bytes. there is no store instruction for that
+
+      Temp write_data;
+      if (count != instr->num_components) {
+         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
+         for (int i = 0; i < count; i++) {
+            Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
+            vec->operands[i] = Operand(elem);
+         }
+         write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
+         vec->definitions[0] = Definition(write_data);
+         ctx->block->instructions.emplace_back(std::move(vec));
+      } else {
+         write_data = data;
+      }
+
+      aco_opcode op;
+      switch (num_bytes) {
+         case 4:
+            op = aco_opcode::buffer_store_dword;
+            break;
+         case 8:
+            op = aco_opcode::buffer_store_dwordx2;
+            break;
+         case 12:
+            op = aco_opcode::buffer_store_dwordx3;
+            break;
+         case 16:
+            op = aco_opcode::buffer_store_dwordx4;
+            break;
+         default:
+            unreachable("Invalid data size for nir_intrinsic_store_scratch.");
+      }
+
+      bld.mubuf(op, offset, rsrc, ctx->scratch_offset, write_data, start * elem_size_bytes, true);
+   }
+}
+
+void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
+   uint8_t log2_ps_iter_samples;
+   if (ctx->program->info->ps.force_persample) {
+      log2_ps_iter_samples =
+         util_logbase2(ctx->options->key.fs.num_samples);
+   } else {
+      log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
+   }
+
+   /* The bit pattern matches that used by fixed function fragment
+    * processing. */
+   static const unsigned ps_iter_masks[] = {
+      0xffff, /* not used */
+      0x5555,
+      0x1111,
+      0x0101,
+      0x0001,
+   };
+   assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
+
+   Builder bld(ctx->program, ctx->block);
+
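+   /* the sample ID is stored in bits [11:8] of the ancillary VGPR */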
+   Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ctx->fs_inputs[fs_input::ancillary], Operand(8u), Operand(4u));
+   Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
+   Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+   bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, ctx->fs_inputs[fs_input::sample_coverage]);
+}
+
+Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   if (cluster_size == 1) {
+      return src;
+   } else if (op == nir_op_iand && cluster_size == 4) {
+      //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
+      Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
+      return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc),
+                      bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp));
+   } else if (op == nir_op_ior && cluster_size == 4) {
+      //subgroupClusteredOr(val, 4) -> wqm(val & exec)
+      return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc),
+                      bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)));
+   } else if (op == nir_op_iand && cluster_size == 64) {
+      //subgroupAnd(val) -> (exec & ~val) == 0
+      Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
+      return bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), tmp, Operand(0u));
+   } else if (op == nir_op_ior && cluster_size == 64) {
+      //subgroupOr(val) -> (val & exec) != 0
+      return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
+   } else if (op == nir_op_ixor && cluster_size == 64) {
+      //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
+      Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+      tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s2), bld.def(s1, scc), tmp);
+      return bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
+   } else {
+      //subgroupClustered{And,Or,Xor}(val, n) ->
+      //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0))
+      //cluster_offset = ~(n - 1) & lane_id
+      //cluster_mask = ((1 << n) - 1)
+      //subgroupClusteredAnd():
+      //   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
+      //subgroupClusteredOr():
+      //   return ((val & exec) >> cluster_offset) & cluster_mask != 0
+      //subgroupClusteredXor():
+      //   return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
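+      //inactive lanes must not affect the result: OR-ing with ~exec forces them to 1
+      //for the AND case, AND-ing with exec forces them to 0 for OR/XOR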
+      Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
+                              bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
+      Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
+
+      Temp tmp;
+      if (op == nir_op_iand)
+         tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+      else
+         tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+
+      uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
+      tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
+      tmp = emit_extract_vector(ctx, tmp, 0, v1);
+      if (cluster_mask != 0xffffffff)
+         tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
+
+      Definition cmp_def = Definition();
+      if (op == nir_op_iand) {
+         cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0);
+      } else if (op == nir_op_ior) {
+         cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
+      } else if (op == nir_op_ixor) {
+         tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
+                        bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
+         cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
+      }
+      cmp_def.setHint(vcc);
+      return cmp_def.getTemp();
+   }
+}
+
+Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
+   //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
+   //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
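+   //mbcnt counts the set bits of the mask in lanes below the current one, so e.g.
+   //the OR scan is true iff some lower-numbered active lane had val set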
+   Temp tmp;
+   if (op == nir_op_iand)
+      tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
+   else
+      tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+
+   Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
+   Temp lo = lohi.def(0).getTemp();
+   Temp hi = lohi.def(1).getTemp();
+   Temp mbcnt = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), hi,
+                         bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), lo, Operand(0u)));
+
+   Definition cmp_def = Definition();
+   if (op == nir_op_iand)
+      cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
+   else if (op == nir_op_ior)
+      cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
+   else if (op == nir_op_ixor)
+      cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u),
+                         bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
+   cmp_def.setHint(vcc);
+   return cmp_def.getTemp();
+}
+
+Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
+   //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
+   //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
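+   //i.e. fold the invocation's own bit into the exclusive scan with the matching 64-bit bitwise op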
+   Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
+   if (op == nir_op_iand)
+      return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
+   else if (op == nir_op_ior)
+      return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
+   else if (op == nir_op_ixor)
+      return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
+
+   assert(false);
+   return Temp();
+}
+
+void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
+{
+   Builder bld(ctx->program, ctx->block);
+   Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
+   if (src.regClass().type() == RegType::vgpr) {
+      bld.pseudo(aco_opcode::p_as_uniform, dst, src);
+   } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
+      bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(dst), Operand(0u), Operand(src));
+   } else if (src.regClass() == s1) {
+      bld.sop1(aco_opcode::s_mov_b32, dst, src);
+   } else if (src.regClass() == s2) {
+      bld.sop1(aco_opcode::s_mov_b64, dst, src);
+   } else {
+      fprintf(stderr, "Unimplemented NIR instr bit size: ");
+      nir_print_instr(&instr->instr, stderr);
+      fprintf(stderr, "\n");
+   }
+}
+
+void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
+{
+   Builder bld(ctx->program, ctx->block);
+   Temp p1 = ctx->fs_inputs[fs_input::persp_center_p1];
+   Temp p2 = ctx->fs_inputs[fs_input::persp_center_p2];
+
+   /* Build DD X/Y */
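+   /* dpp_quad_perm(0,0,0,0) broadcasts the value of the first lane of each 2x2 quad
+    * (the top-left pixel); subtracting it from the values read with quad_perm(1,1,1,1)
+    * and (2,2,2,2) yields the per-quad x and y derivatives of the barycentrics. */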
+   Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_quad_perm(0, 0, 0, 0));
+   Temp ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(1, 1, 1, 1));
+   Temp ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(2, 2, 2, 2));
+   Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_quad_perm(0, 0, 0, 0));
+   Temp ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(1, 1, 1, 1));
+   Temp ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(2, 2, 2, 2));
+
+   /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
+   Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
+   Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
+   tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
+   tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
+   Temp wqm1 = bld.tmp(v1);
+   emit_wqm(ctx, tmp1, wqm1, true);
+   Temp wqm2 = bld.tmp(v1);
+   emit_wqm(ctx, tmp2, wqm2, true);
+   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
+   return;
+}
+
+void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Builder bld(ctx->program, ctx->block);
+   switch(instr->intrinsic) {
+   case nir_intrinsic_load_barycentric_sample:
+   case nir_intrinsic_load_barycentric_pixel:
+   case nir_intrinsic_load_barycentric_centroid: {
+      glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
+      fs_input input = get_interp_input(instr->intrinsic, mode);
+
+      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+      if (input == fs_input::max_inputs) {
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
+                    Operand(0u), Operand(0u));
+      } else {
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
+                    ctx->fs_inputs[input],
+                    ctx->fs_inputs[input + 1]);
+      }
+      emit_split_vector(ctx, dst, 2);
+      break;
+   }
+   case nir_intrinsic_load_barycentric_at_sample: {
+      uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
+      switch (ctx->options->key.fs.num_samples) {
+         case 2: sample_pos_offset += 1 << 3; break;
+         case 4: sample_pos_offset += 3 << 3; break;
+         case 8: sample_pos_offset += 7 << 3; break;
+         default: break;
+      }
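+      /* The ring is assumed to hold the 1x/2x/4x/8x sample position tables back-to-back,
+       * 8 bytes (two floats) per sample, so skip 1, 3 or 7 entries to reach the table
+       * for the current sample count. */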
+      Temp sample_pos;
+      Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
+      nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
+      if (addr.type() == RegType::sgpr) {
+         Operand offset;
+         if (const_addr) {
+            sample_pos_offset += const_addr->u32 << 3;
+            offset = Operand(sample_pos_offset);
+         } else if (ctx->options->chip_class >= GFX9) {
+            offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
+         } else {
+            offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
+            offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset));
+         }
+         addr = ctx->private_segment_buffer;
+         sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand(offset));
+
+      } else if (ctx->options->chip_class >= GFX9) {
+         addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
+         sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, ctx->private_segment_buffer, sample_pos_offset);
+      } else {
+         /* addr += ctx->private_segment_buffer + sample_pos_offset */
+         Temp tmp0 = bld.tmp(s1);
+         Temp tmp1 = bld.tmp(s1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), ctx->private_segment_buffer);
+         Definition scc_tmp = bld.def(s1, scc);
+         tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
+         tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), scc_tmp.getTemp());
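+         /* tmp0:tmp1 now hold private_segment_buffer + sample_pos_offset; add the
+          * per-lane (addr << 3) with a VGPR carry chain to form the 64-bit flat address. */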
+         addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
+         Temp pck0 = bld.tmp(v1);
+         Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
+         tmp1 = as_vgpr(ctx, tmp1);
+         Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry);
+         addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
+
+         /* sample_pos = flat_load_dwordx2 addr */
+         sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
+      }
+
+      /* sample_pos -= 0.5 */
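+      /* 0x3f000000 is 0.5f: the stored positions are in [0, 1) pixel space, while the
+       * interpolation offsets are relative to the pixel center. */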
+      Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
+      Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
+      bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
+      pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
+      pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
+
+      emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
+      break;
+   }
+   case nir_intrinsic_load_barycentric_at_offset: {
+      Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
+      RegClass rc = RegClass(offset.type(), 1);
+      Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
+      bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
+      emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
+      break;
+   }
+   case nir_intrinsic_load_front_face: {
+      bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
+               Operand(0u), ctx->fs_inputs[fs_input::front_face]).def(0).setHint(vcc);
+      break;
+   }
+   case nir_intrinsic_load_view_index:
+   case nir_intrinsic_load_layer_id: {
+      if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) {
+         Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+         bld.copy(Definition(dst), Operand(ctx->view_index));
+         break;
+      }
+
+      unsigned idx = nir_intrinsic_base(instr);
+      bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
+                 Operand(2u), bld.m0(ctx->prim_mask), idx, 0);
+      break;
+   }
+   case nir_intrinsic_load_frag_coord: {
+      emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
+      break;
+   }
+   case nir_intrinsic_load_sample_pos: {
+      Temp posx = ctx->fs_inputs[fs_input::frag_pos_0];
+      Temp posy = ctx->fs_inputs[fs_input::frag_pos_1];
+      bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
+                 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
+                 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
+      break;
+   }
+   case nir_intrinsic_load_interpolated_input:
+      visit_load_interpolated_input(ctx, instr);
+      break;
+   case nir_intrinsic_store_output:
+      visit_store_output(ctx, instr);
+      break;
+   case nir_intrinsic_load_input:
+      visit_load_input(ctx, instr);
+      break;
+   case nir_intrinsic_load_ubo:
+      visit_load_ubo(ctx, instr);
+      break;
+   case nir_intrinsic_load_push_constant:
+      visit_load_push_constant(ctx, instr);
+      break;
+   case nir_intrinsic_load_constant:
+      visit_load_constant(ctx, instr);
+      break;
+   case nir_intrinsic_vulkan_resource_index:
+      visit_load_resource(ctx, instr);
+      break;
+   case nir_intrinsic_discard:
+      visit_discard(ctx, instr);
+      break;
+   case nir_intrinsic_discard_if:
+      visit_discard_if(ctx, instr);
+      break;
+   case nir_intrinsic_load_shared:
+      visit_load_shared(ctx, instr);
+      break;
+   case nir_intrinsic_store_shared:
+      visit_store_shared(ctx, instr);
+      break;
+   case nir_intrinsic_shared_atomic_add:
+   case nir_intrinsic_shared_atomic_imin:
+   case nir_intrinsic_shared_atomic_umin:
+   case nir_intrinsic_shared_atomic_imax:
+   case nir_intrinsic_shared_atomic_umax:
+   case nir_intrinsic_shared_atomic_and:
+   case nir_intrinsic_shared_atomic_or:
+   case nir_intrinsic_shared_atomic_xor:
+   case nir_intrinsic_shared_atomic_exchange:
+   case nir_intrinsic_shared_atomic_comp_swap:
+      visit_shared_atomic(ctx, instr);
+      break;
+   case nir_intrinsic_image_deref_load:
+      visit_image_load(ctx, instr);
+      break;
+   case nir_intrinsic_image_deref_store:
+      visit_image_store(ctx, instr);
+      break;
+   case nir_intrinsic_image_deref_atomic_add:
+   case nir_intrinsic_image_deref_atomic_umin:
+   case nir_intrinsic_image_deref_atomic_imin:
+   case nir_intrinsic_image_deref_atomic_umax:
+   case nir_intrinsic_image_deref_atomic_imax:
+   case nir_intrinsic_image_deref_atomic_and:
+   case nir_intrinsic_image_deref_atomic_or:
+   case nir_intrinsic_image_deref_atomic_xor:
+   case nir_intrinsic_image_deref_atomic_exchange:
+   case nir_intrinsic_image_deref_atomic_comp_swap:
+      visit_image_atomic(ctx, instr);
+      break;
+   case nir_intrinsic_image_deref_size:
+      visit_image_size(ctx, instr);
+      break;
+   case nir_intrinsic_load_ssbo:
+      visit_load_ssbo(ctx, instr);
+      break;
+   case nir_intrinsic_store_ssbo:
+      visit_store_ssbo(ctx, instr);
+      break;
+   case nir_intrinsic_load_global:
+      visit_load_global(ctx, instr);
+      break;
+   case nir_intrinsic_store_global:
+      visit_store_global(ctx, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_add:
+   case nir_intrinsic_ssbo_atomic_imin:
+   case nir_intrinsic_ssbo_atomic_umin:
+   case nir_intrinsic_ssbo_atomic_imax:
+   case nir_intrinsic_ssbo_atomic_umax:
+   case nir_intrinsic_ssbo_atomic_and:
+   case nir_intrinsic_ssbo_atomic_or:
+   case nir_intrinsic_ssbo_atomic_xor:
+   case nir_intrinsic_ssbo_atomic_exchange:
+   case nir_intrinsic_ssbo_atomic_comp_swap:
+      visit_atomic_ssbo(ctx, instr);
+      break;
+   case nir_intrinsic_load_scratch:
+      visit_load_scratch(ctx, instr);
+      break;
+   case nir_intrinsic_store_scratch:
+      visit_store_scratch(ctx, instr);
+      break;
+   case nir_intrinsic_get_buffer_size:
+      visit_get_buffer_size(ctx, instr);
+      break;
+   case nir_intrinsic_barrier: {
+      unsigned* bsize = ctx->program->info->cs.block_size;
+      unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
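+      /* A workgroup of at most 64 invocations fits into a single wave, which runs in
+       * lockstep, so the s_barrier can be omitted. */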
+      if (workgroup_size > 64)
+         bld.sopp(aco_opcode::s_barrier);
+      break;
+   }
+   case nir_intrinsic_group_memory_barrier:
+   case nir_intrinsic_memory_barrier:
+   case nir_intrinsic_memory_barrier_atomic_counter:
+   case nir_intrinsic_memory_barrier_buffer:
+   case nir_intrinsic_memory_barrier_image:
+   case nir_intrinsic_memory_barrier_shared:
+      emit_memory_barrier(ctx, instr);
+      break;
+   case nir_intrinsic_load_num_work_groups:
+   case nir_intrinsic_load_work_group_id:
+   case nir_intrinsic_load_local_invocation_id: {
+      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+      Temp* ids;
+      if (instr->intrinsic == nir_intrinsic_load_num_work_groups)
+         ids = ctx->num_workgroups;
+      else if (instr->intrinsic == nir_intrinsic_load_work_group_id)
+         ids = ctx->workgroup_ids;
+      else
+         ids = ctx->local_invocation_ids;
+      bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
+                 ids[0].id() ? Operand(ids[0]) : Operand(1u),
+                 ids[1].id() ? Operand(ids[1]) : Operand(1u),
+                 ids[2].id() ? Operand(ids[2]) : Operand(1u));
+      emit_split_vector(ctx, dst, 3);
+      break;
+   }
+   case nir_intrinsic_load_local_invocation_index: {
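+      /* flat index = wave_id_in_threadgroup * 64 + lane_id: bits [11:6] of tg_size hold the
+       * wave id (masking with 0xfc0 keeps it already multiplied by the wave size), and the
+       * mbcnt pair yields the lane id. */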
+      Temp id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
+                         bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
+      Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
+      bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
+      break;
+   }
+   case nir_intrinsic_load_subgroup_id: {
+      if (ctx->stage == compute_cs) {
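+         /* The subgroup id is the wave id within the threadgroup, i.e. bits [11:6] of tg_size. */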
+         Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
+         bld.sop2(aco_opcode::s_lshr_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), tg_num, Operand(0x6u));
+      } else {
+         bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
+      }
+      break;
+   }
+   case nir_intrinsic_load_subgroup_invocation: {
+      bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand((uint32_t) -1),
+               bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
+      break;
+   }
+   case nir_intrinsic_load_num_subgroups: {
+      if (ctx->stage == compute_cs)
+         bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), ctx->tg_size);
+      else
+         bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
+      break;
+   }
+   case nir_intrinsic_ballot: {
+      Definition tmp = bld.def(s2);
+      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+      if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s2) {
+         bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src);
+      } else if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s1) {
+         bld.sop2(aco_opcode::s_cselect_b64, tmp, Operand(exec, s2), Operand(0u), bld.scc(src));
+      } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
+         bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src);
+      } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
+         bld.vopc(aco_opcode::v_cmp_lg_u64, tmp, Operand(0u), src);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa));
+      break;
+   }
+   case nir_intrinsic_shuffle: {
+      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+      if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+         emit_uniform_subgroup(ctx, instr, src);
+      } else {
+         Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
+         assert(tid.regClass() == v1);
+         Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+         if (src.regClass() == v1) {
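+            /* ds_bpermute_b32 uses byte addressing, so scale the lane index by 4;
+             * every lane then reads src from the selected lane. */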
+            tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
+            emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, src), dst);
+         } else if (src.regClass() == v2) {
+            tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
+
+            Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
+            bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
+            lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, lo));
+            hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, hi));
+            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
+            emit_split_vector(ctx, dst, 2);
+         } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
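+            /* Booleans are lane masks: shift the 64-bit mask right by the selected lane id
+             * and test bit 0. */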
+            Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
+            tmp = emit_extract_vector(ctx, tmp, 0, v1);
+            tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
+            emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst);
+         } else {
+            fprintf(stderr, "Unimplemented NIR instr bit size: ");
+            nir_print_instr(&instr->instr, stderr);
+            fprintf(stderr, "\n");
+         }
+      }
+      break;
+   }
+   case nir_intrinsic_load_sample_id: {
+      bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
+               ctx->fs_inputs[ancillary], Operand(8u), Operand(4u));
+      break;
+   }
+   case nir_intrinsic_load_sample_mask_in: {
+      visit_load_sample_mask_in(ctx, instr);
+      break;
+   }
+   case nir_intrinsic_read_first_invocation: {
+      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+      if (src.regClass() == v1) {
+         emit_wqm(ctx,
+                  bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
+                  dst);
+      } else if (src.regClass() == v2) {
+         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
+         lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
+         hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
+         emit_split_vector(ctx, dst, 2);
+      } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
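+         /* For a bool lane mask: find the first active lane with s_ff1 on exec and test
+          * that bit of src. */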
+         emit_wqm(ctx,
+                  bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
+                           bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2))),
+                  dst);
+      } else if (src.regClass() == s1) {
+         bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
+      } else if (src.regClass() == s2) {
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_intrinsic_read_invocation: {
+      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+      Temp lane = get_ssa_temp(ctx, instr->src[1].ssa);
+      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+      assert(lane.regClass() == s1);
+      if (src.regClass() == v1) {
+         emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), src, lane), dst);
+      } else if (src.regClass() == v2) {
+         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
+         lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), lo, lane));
+         hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), hi, lane));
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
+         emit_split_vector(ctx, dst, 2);
+      } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
+         emit_wqm(ctx, bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, lane), dst);
+      } else if (src.regClass() == s1) {
+         bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
+      } else if (src.regClass() == s2) {
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
+      } else {
+         fprintf(stderr, "Unimplemented NIR instr bit size: ");
+         nir_print_instr(&instr->instr, stderr);
+         fprintf(stderr, "\n");
+      }
+      break;
+   }
+   case nir_intrinsic_vote_all: {
+      Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
+      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+      assert(src.regClass() == s2);
+      assert(dst.regClass() == s1);
+
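+      /* all(val) <=> (val & exec) == exec */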
+      Definition tmp = bld.def(s1);
+      bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(tmp),
+               bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)),
+               Operand(exec, s2));
+      emit_wqm(ctx, tmp.getTemp(), dst);
+      break;
+   }
+   case nir_intrinsic_vote_any: {
+      Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
+      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+      assert(src.regClass() == s2);
+      assert(dst.regClass() == s1);
+
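+      /* any(val) <=> (val & exec) != 0, taken from scc */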
+      Definition tmp = bld.def(s1);
+      bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(tmp), src, Operand(exec, s2));
+      emit_wqm(ctx, tmp.getTemp(), dst);
+      break;
+   }
+   case nir_intrinsic_reduce:
+   case nir_intrinsic_inclusive_scan:
+   case nir_intrinsic_exclusive_scan: {
+      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+      nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
+      unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
+         nir_intrinsic_cluster_size(instr) : 0;
+      cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : 64, 64));
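+      /* cluster_size == 0 means the whole subgroup; clamp to the wave size and round up to a
+       * power of two (Vulkan only allows power-of-two cluster sizes anyway). */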
+
+      if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
+         emit_uniform_subgroup(ctx, instr, src);
+      } else if (instr->dest.ssa.bit_size == 1) {
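+         /* Treating the booleans as 0/1 values: a product or minimum reduces to AND,
+          * a sum (mod 2) to XOR and a maximum to OR, so the reduction can be done on
+          * the lane mask. */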
+         if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
+            op = nir_op_iand;
+         else if (op == nir_op_iadd)
+            op = nir_op_ixor;
+         else if (op == nir_op_umax || op == nir_op_imax)
+            op = nir_op_ior;
+         assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
+
+         switch (instr->intrinsic) {
+         case nir_intrinsic_reduce:
+            emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
+            break;
+         case nir_intrinsic_exclusive_scan:
+            emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
+            break;
+         case nir_intrinsic_inclusive_scan:
+            emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
+            break;
+         default:
+            assert(false);
+         }
+      } else if (cluster_size == 1) {
+         bld.copy(Definition(dst), src);
+      } else {
+         src = as_vgpr(ctx, src);
+
+         ReduceOp reduce_op;
+         switch (op) {
+         #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
+            CASE(iadd)
+            CASE(imul)
+            CASE(fadd)
+            CASE(fmul)
+            CASE(imin)
+            CASE(umin)
+            CASE(fmin)
+            CASE(imax)
+            CASE(umax)
+            CASE(fmax)
+            CASE(iand)
+            CASE(ior)
+            CASE(ixor)
+            default:
+               unreachable("unknown reduction op");
+         #undef CASE
+         }
+
+         aco_opcode aco_op;
+         switch (instr->intrinsic) {
+            case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
+            case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
+            case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
+            default:
+               unreachable("unknown reduce intrinsic");
+         }
+
+         aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
+         reduce->operands[0] = Operand(src);
+         // filled in by aco_reduce_assign.cpp, used internally as part of the
+         // reduce sequence
+         assert(dst.size() == 1 || dst.size() == 2);