aco/wave32: Use lane mask regclass for exec/vcc.

author Timur Kristóf <timur.kristof@gmail.com>

Wed, 27 Nov 2019 10:04:47 +0000 (11:04 +0100)

committer Daniel Schürmann <daniel@schuermann.dev>

Wed, 4 Dec 2019 10:36:01 +0000 (10:36 +0000)
author Timur Kristóf <timur.kristof@gmail.com>
Wed, 27 Nov 2019 10:04:47 +0000 (11:04 +0100)
committer Daniel Schürmann <daniel@schuermann.dev>
Wed, 4 Dec 2019 10:36:01 +0000 (10:36 +0000)
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py

index e70d9317b3f726e26ac0f3d0b0bac35a419d6329..ada0806f6a98ececdf49f2dcd7a6de50e3b1df21 100644 (file)
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -135,13 +135,14 @@ public:
     Program *program;
     bool use_iterator;
     bool start; // only when use_iterator == false
+   RegClass lm;
  
     std::vector<aco_ptr<Instruction>> *instructions;
     std::vector<aco_ptr<Instruction>>::iterator it;
  
-   Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), instructions(NULL) {}
-   Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), instructions(&block->instructions) {}
-   Builder(Program *pgm, std::vector<aco_ptr<Instruction>> *instrs) : program(pgm), use_iterator(false), start(false), instructions(instrs) {}
+   Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(NULL) {} /* NULL pgm falls back to s2, same as the two overloads below */
+   Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(&block->instructions) {} /* lm mirrors the program's lane_mask; inserts into block->instructions */
+   Builder(Program *pgm, std::vector<aco_ptr<Instruction>> *instrs) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(instrs) {} /* lm mirrors the program's lane_mask; inserts into the given instruction vector */
  
     void moveEnd(Block *block) {
        instructions = &block->instructions;
@@ -265,17 +266,26 @@ public:
  
  % for fixed in ['m0', 'vcc', 'exec', 'scc']:
     Operand ${fixed}(Temp tmp) {
+       % if fixed == 'vcc' or fixed == 'exec':
+          assert(tmp.regClass() == lm); /* only emitted for vcc/exec: the temp fixed to that register must use the builder's lane-mask class (lm) */
+       % endif
         Operand op(tmp);
         op.setFixed(aco::${fixed});
         return op;
     }
  
     Definition ${fixed}(Definition def) {
+       % if fixed == 'vcc' or fixed == 'exec':
+          assert(def.regClass() == lm); /* only emitted for vcc/exec: the definition fixed to that register must use the builder's lane-mask class (lm) */
+       % endif
         def.setFixed(aco::${fixed});
         return def;
     }
  
     Definition hint_${fixed}(Definition def) {
+       % if fixed == 'vcc' or fixed == 'exec':
+          assert(def.regClass() == lm); /* only emitted for vcc/exec: a definition hinted towards that register must use the builder's lane-mask class (lm) */
+       % endif
         def.setHint(aco::${fixed});
         return def;
     }
@@ -350,11 +360,11 @@ public:
        assert((post_ra || b.op.hasRegClass()) && b.op.regClass().type() == RegType::vgpr);
  
        if (!carry_in.op.isUndefined())
-         return vop2(aco_opcode::v_addc_co_u32, Definition(dst), hint_vcc(def(s2)), a, b, carry_in);
+         return vop2(aco_opcode::v_addc_co_u32, Definition(dst), hint_vcc(def(lm)), a, b, carry_in);
        else if (program->chip_class >= GFX10 && carry_out)
           return vop3(aco_opcode::v_add_co_u32_e64, Definition(dst), def(s2), a, b);
        else if (program->chip_class < GFX9 || carry_out)
-         return vop2(aco_opcode::v_add_co_u32, Definition(dst), hint_vcc(def(s2)), a, b);
+         return vop2(aco_opcode::v_add_co_u32, Definition(dst), hint_vcc(def(lm)), a, b);
        else
           return vop2(aco_opcode::v_add_u32, Definition(dst), a, b);
     }
@@ -407,6 +417,7 @@ public:
        }
        return insert(std::move(sub));
     }
+
  <%
  import itertools
  formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]),
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp

index 31ae5ca658cf20f05fa93ca021b8d1dea21cee1a..cbc0698096ba0f39decedeb83ddda81fa0d80877 100644 (file)
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -302,14 +302,15 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
        return;
     if (ctx.info[idx].exec.back().second & mask_type_global) {
        Temp exec_mask = ctx.info[idx].exec.back().first;
-      exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), exec_mask);
+      exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), exec_mask);
        ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
        return;
     }
     /* otherwise, the WQM mask should be one below the current mask */
     ctx.info[idx].exec.pop_back();
     assert(ctx.info[idx].exec.back().second & mask_type_wqm);
-   ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+   assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+   ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                  ctx.info[idx].exec.back().first);
  }
  
@@ -324,14 +325,15 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
         !(ctx.info[idx].exec.back().second & mask_type_loop)) {
        ctx.info[idx].exec.pop_back();
        assert(ctx.info[idx].exec.back().second & mask_type_exact);
-      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+      assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                     ctx.info[idx].exec.back().first);
        return;
     }
     /* otherwise, we create an exact mask and push to the stack */
     Temp wqm = ctx.info[idx].exec.back().first;
-   Temp exact = bld.tmp(s2);
-   wqm = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+   Temp exact = bld.tmp(bld.lm);
+   wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                    bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm));
     ctx.info[idx].exec.back().first = wqm;
     ctx.info[idx].exec.emplace_back(exact, mask_type_exact);
@@ -359,7 +361,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
        } else {
           uint8_t mask = mask_type_global;
           if (ctx.program->needs_wqm) {
-            exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), bld.exec(exec_mask));
+            exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask));
              mask |= mask_type_wqm;
           } else {
              mask |= mask_type_exact;
@@ -383,7 +385,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
           aco_ptr<Pseudo_instruction> phi;
           for (int i = 0; i < info.num_exec_masks - 1; i++) {
              phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
-            phi->definitions[0] = bld.def(s2);
+            phi->definitions[0] = bld.def(bld.lm);
              phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first);
              ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
           }
@@ -393,7 +395,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
        if (info.has_divergent_break) {
           /* this phi might be trivial but ensures a parallelcopy on the loop header */
           aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
-         phi->definitions[0] = bld.def(s2);
+         phi->definitions[0] = bld.def(bld.lm);
           phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
           ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
        }
@@ -401,9 +403,9 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
        /* create ssa name for loop active mask */
        aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
        if (info.has_divergent_continue)
-         phi->definitions[0] = bld.def(s2);
+         phi->definitions[0] = bld.def(bld.lm);
        else
-         phi->definitions[0] = bld.def(s2, exec);
+         phi->definitions[0] = bld.def(bld.lm, exec);
        phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first);
        Temp loop_active = bld.insert(std::move(phi));
  
@@ -423,7 +425,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
              i++;
           }
           uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
-         ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+         assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+         ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                      ctx.info[idx].exec.back().first), mask_type);
        }
  
@@ -480,7 +483,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
           } else {
              /* create phi for loop footer */
              aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
-            phi->definitions[0] = bld.def(s2);
+            phi->definitions[0] = bld.def(bld.lm);
              for (unsigned i = 0; i < phi->operands.size(); i++)
                 phi->operands[i] = Operand(ctx.info[preds[i]].exec[k].first);
              ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
@@ -510,7 +513,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
              transition_to_Exact(ctx, bld, idx);
        }
  
-      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+      assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                     ctx.info[idx].exec.back().first);
  
        ctx.loop.pop_back();
@@ -536,7 +540,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
              continue;
           }
  
-         Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(s2, exec) : bld.def(s2),
+         Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(bld.lm, exec) : bld.def(bld.lm),
                                 ctx.info[preds[0]].exec[i].first,
                                 ctx.info[preds[1]].exec[i].first);
           uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
@@ -578,7 +582,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
  
     if (block->kind & block_kind_merge) {
        Temp restore = ctx.info[idx].exec.back().first;
-      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), restore);
+      assert(restore.size() == bld.lm.size());
+      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), restore);
     }
  
     return i;
@@ -589,7 +594,7 @@ void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instructi
     Operand offset = instr->operands[1];
     if (need_check) {
        /* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
-      Temp nonempty = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), cur_exec, Operand(0u));
+      Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u));
  
        if (offset.isLiteral())
           offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset);
@@ -665,7 +670,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
           assert(num);
           Operand cond = instr->operands[0];
           for (int i = num - 1; i >= 0; i--) {
-            Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+            Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                            ctx.info[block->index].exec[i].first, cond);
              if (i == num - 1) {
                 andn2->operands[0].setFixed(exec);
@@ -689,8 +694,9 @@ void process_instructions(exec_ctx& ctx, Block* block,
  
        if (instr->opcode == aco_opcode::p_is_helper || instr->opcode == aco_opcode::p_load_helper) {
           Definition dst = instr->definitions[0];
+         assert(dst.size() == bld.lm.size());
           if (state == Exact) {
-            instr.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b64, Format::SOP1, 1, 1));
+            instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
              instr->operands[0] = Operand(0u);
              instr->definitions[0] = dst;
           } else {
@@ -710,7 +716,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
              assert(instr->opcode == aco_opcode::p_is_helper || exact_mask.second & mask_type_initial);
              assert(exact_mask.second & mask_type_exact);
  
-            instr.reset(create_instruction<SOP2_instruction>(aco_opcode::s_andn2_b64, Format::SOP2, 2, 2));
+            instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
              instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */
              instr->operands[1] = Operand(exact_mask.first);
              instr->definitions[0] = dst;
@@ -726,8 +732,8 @@ void process_instructions(exec_ctx& ctx, Block* block,
           if (instr->operands.empty()) {
              /* transition to exact and set exec to zero */
              Temp old_exec = ctx.info[block->index].exec.back().first;
-            Temp new_exec = bld.tmp(s2);
-            cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+            Temp new_exec = bld.tmp(bld.lm);
+            cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                              bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
              if (ctx.info[block->index].exec.back().second & mask_type_exact) {
                 ctx.info[block->index].exec.back().first = new_exec;
@@ -746,7 +752,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
           num += ctx.info[block->index].exec.size() - 1;
           for (int i = num - 1; i >= 0; i--) {
              if (ctx.info[block->index].exec[i].second & mask_type_exact) {
-               Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+               Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                               ctx.info[block->index].exec[i].first, cond);
                 if (i == num - 1) {
                    andn2->operands[0].setFixed(exec);
@@ -878,13 +884,13 @@ void add_branch_code(exec_ctx& ctx, Block* block)
        }
  
        Temp old_exec = ctx.info[idx].exec.back().first;
-      Temp new_exec = bld.tmp(s2);
-      Temp cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+      Temp new_exec = bld.tmp(bld.lm);
+      Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                             bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
        ctx.info[idx].exec.back().first = new_exec;
  
        for (int i = num - 1; i >= 0; i--) {
-         Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+         Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                         ctx.info[block->index].exec[i].first, cond);
           if (i == (int)ctx.info[idx].exec.size() - 1)
              andn2->definitions[0].setFixed(exec);
@@ -912,7 +918,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
           Temp cond = Temp();
           for (int exec_idx = ctx.info[idx].exec.size() - 1; exec_idx >= 0; exec_idx--) {
              if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) {
-               cond = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u));
+               cond = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u));
                 break;
              }
           }
@@ -957,8 +963,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
        Temp current_exec = ctx.info[idx].exec.back().first;
        uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
  
-      Temp then_mask = bld.tmp(s2);
-      Temp old_exec = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+      Temp then_mask = bld.tmp(bld.lm);
+      Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                                 bld.exec(Definition(then_mask)), cond, bld.exec(current_exec));
  
        ctx.info[idx].exec.back().first = old_exec;
@@ -978,7 +984,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
        uint8_t mask_type = ctx.info[idx].exec.back().second;
        ctx.info[idx].exec.pop_back();
        Temp orig_exec = ctx.info[idx].exec.back().first;
-      Temp else_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2, exec),
+      Temp else_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm, exec),
                                  bld.def(s1, scc), orig_exec, bld.exec(then_mask));
  
        /* add next current exec to the stack */
@@ -998,7 +1004,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
        for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
           cond = bld.tmp(s1);
           Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
-         exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)),
+         exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
                                exec_mask, current_exec);
           ctx.info[idx].exec[exec_idx].first = exec_mask;
           if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
@@ -1010,7 +1016,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
        unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
        Block& succ = ctx.program->blocks[succ_idx];
        if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
-         ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u));
+         ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
        }
  
        bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
@@ -1028,7 +1034,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
              break;
           cond = bld.tmp(s1);
           Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
-         exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)),
+         exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
                                exec_mask, bld.exec(current_exec));
           ctx.info[idx].exec[exec_idx].first = exec_mask;
        }
@@ -1039,7 +1045,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
        unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
        Block& succ = ctx.program->blocks[succ_idx];
        if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
-         ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u));
+         ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
        }
  
        bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp

index a2b2c21170cf703ae170d61a1e1358312e4c8521..9de9d5dec140e2deffd5f47a66f45fadaad0b3f6 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -351,12 +351,12 @@ Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2
  {
     Builder bld(ctx->program, ctx->block);
     if (!dst.id())
-      dst = bld.tmp(s2);
+      dst = bld.tmp(bld.lm);
  
     assert(val.regClass() == s1);
-   assert(dst.regClass() == s2);
+   assert(dst.regClass() == bld.lm);
  
-   return bld.sop2(aco_opcode::s_cselect_b64, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
+   return bld.sop2(Builder::s_cselect, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
  }
  
  Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
@@ -365,12 +365,12 @@ Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1
     if (!dst.id())
        dst = bld.tmp(s1);
  
-   assert(val.regClass() == s2);
+   assert(val.regClass() == bld.lm);
     assert(dst.regClass() == s1);
  
     /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
     Temp tmp = bld.tmp(s1);
-   bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(Definition(tmp)), val, Operand(exec, s2));
+   bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
     return emit_wqm(ctx, tmp, dst);
  }
  
@@ -489,6 +489,8 @@ void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
  {
     Temp src0 = get_alu_src(ctx, instr->src[0]);
     Temp src1 = get_alu_src(ctx, instr->src[1]);
+   assert(src0.size() == src1.size());
+
     aco_ptr<Instruction> vopc;
     if (src1.type() == RegType::sgpr) {
        if (src0.type() == RegType::vgpr) {
@@ -549,12 +551,13 @@ void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
  {
     Temp src0 = get_alu_src(ctx, instr->src[0]);
     Temp src1 = get_alu_src(ctx, instr->src[1]);
+   Builder bld(ctx->program, ctx->block);
  
-   assert(dst.regClass() == s2);
+   assert(dst.regClass() == bld.lm);
     assert(src0.type() == RegType::sgpr);
     assert(src1.type() == RegType::sgpr);
+   assert(src0.regClass() == src1.regClass());
  
-   Builder bld(ctx->program, ctx->block);
     /* Emit the SALU comparison instruction */
     Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
     /* Turn the result into a per-lane bool */
@@ -580,17 +583,17 @@ void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
        emit_sopc_instruction(ctx, instr, op, dst);
  }
  
-void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op64, Temp dst)
+void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst) /* emits one SOP2 on two 1-bit NIR sources; callers in this file pass Builder::s_or, Builder::s_and or Builder::s_xor */
  {
     Builder bld(ctx->program, ctx->block);
     Temp src0 = get_alu_src(ctx, instr->src[0]);
     Temp src1 = get_alu_src(ctx, instr->src[1]);
  
-   assert(dst.regClass() == s2);
-   assert(src0.regClass() == s2);
-   assert(src1.regClass() == s2);
+   assert(dst.regClass() == bld.lm); /* destination and both sources must use the builder's lane-mask class instead of a hard-coded s2 */
+   assert(src0.regClass() == bld.lm);
+   assert(src1.regClass() == bld.lm);
  
-   bld.sop2(op64, Definition(dst), bld.def(s1, scc), src0, src1);
+   bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1); /* second definition is an s1 fixed to scc; presumably the builder picks the opcode width from the wave size - confirm in Builder */
  }
  
  void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
@@ -600,7 +603,7 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
     Temp then = get_alu_src(ctx, instr->src[1]);
     Temp els = get_alu_src(ctx, instr->src[2]);
  
-   assert(cond.regClass() == s2);
+   assert(cond.regClass() == bld.lm);
  
     if (dst.type() == RegType::vgpr) {
        aco_ptr<Instruction> bcsel;
@@ -628,14 +631,15 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
     }
  
     if (instr->dest.dest.ssa.bit_size == 1) {
-      assert(dst.regClass() == s2);
-      assert(then.regClass() == s2);
-      assert(els.regClass() == s2);
+      assert(dst.regClass() == bld.lm);
+      assert(then.regClass() == bld.lm);
+      assert(els.regClass() == bld.lm);
     }
  
     if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
        if (dst.regClass() == s1 || dst.regClass() == s2) {
           assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
+         assert(dst.size() == then.size());
           aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
           bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
        } else {
@@ -652,20 +656,20 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
     assert(instr->dest.dest.ssa.bit_size == 1);
  
     if (cond.id() != then.id())
-      then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then);
+      then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
  
     if (cond.id() == els.id())
-      bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then);
+      bld.sop1(Builder::s_mov, Definition(dst), then);
     else
-      bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then,
-               bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond));
+      bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
+               bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
  }
  
  void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
                      aco_opcode op, uint32_t undo)
  {
     /* multiply by 16777216 to handle denormals */
-   Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(s2)),
+   Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
                                 as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
     Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
     scaled = bld.vop1(op, bld.def(v1), scaled);
@@ -766,9 +770,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_inot: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->dest.dest.ssa.bit_size == 1) {
-         assert(src.regClass() == s2);
-         assert(dst.regClass() == s2);
-         bld.sop2(aco_opcode::s_andn2_b64, Definition(dst), bld.def(s1, scc), Operand(exec, s2), src);
+         assert(src.regClass() == bld.lm);
+         assert(dst.regClass() == bld.lm);
+         bld.sop2(Builder::s_andn2, Definition(dst), bld.def(s1, scc), Operand(exec, bld.lm), src);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
        } else if (dst.type() == RegType::sgpr) {
@@ -835,12 +839,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz);
        } else if (dst.regClass() == v1) {
           Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
-         Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+         Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
           bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
        } else if (dst.regClass() == v2) {
           Temp upper = emit_extract_vector(ctx, src, 1, v1);
           Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
-         Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+         Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
           Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
           upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
@@ -901,7 +905,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_ior: {
        if (instr->dest.dest.ssa.bit_size == 1) {
-         emit_boolean_logic(ctx, instr, aco_opcode::s_or_b64, dst);
+         emit_boolean_logic(ctx, instr, Builder::s_or, dst);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
        } else if (dst.regClass() == s1) {
@@ -917,7 +921,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_iand: {
        if (instr->dest.dest.ssa.bit_size == 1) {
-         emit_boolean_logic(ctx, instr, aco_opcode::s_and_b64, dst);
+         emit_boolean_logic(ctx, instr, Builder::s_and, dst);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
        } else if (dst.regClass() == s1) {
@@ -933,7 +937,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_ixor: {
        if (instr->dest.dest.ssa.bit_size == 1) {
-         emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b64, dst);
+         emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
        } else if (dst.regClass() == s1) {
@@ -1709,16 +1713,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_fsign: {
        Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
        if (dst.size() == 1) {
-         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
           src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
-         cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+         cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
           bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
        } else if (dst.size() == 2) {
-         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
           Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
           Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
  
-         cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+         cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
           tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
           upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
  
@@ -1922,7 +1926,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
           Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
-         Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent);
+         Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent);
           exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
           Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
           mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
@@ -1986,7 +1990,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_b2f32: {
        Temp src = get_alu_src(ctx, instr->src[0]);
-      assert(src.regClass() == s2);
+      assert(src.regClass() == bld.lm);
  
        if (dst.regClass() == s1) {
           src = bool_to_scalar_condition(ctx, src);
@@ -2000,7 +2004,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_b2f64: {
        Temp src = get_alu_src(ctx, instr->src[0]);
-      assert(src.regClass() == s2);
+      assert(src.regClass() == bld.lm);
  
        if (dst.regClass() == s2) {
           src = bool_to_scalar_condition(ctx, src);
@@ -2073,7 +2077,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_b2i32: {
        Temp src = get_alu_src(ctx, instr->src[0]);
-      assert(src.regClass() == s2);
+      assert(src.regClass() == bld.lm);
  
        if (dst.regClass() == s1) {
           // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
@@ -2087,7 +2091,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_i2b1: {
        Temp src = get_alu_src(ctx, instr->src[0]);
-      assert(dst.regClass() == s2);
+      assert(dst.regClass() == bld.lm);
  
        if (src.type() == RegType::vgpr) {
           assert(src.regClass() == v1 || src.regClass() == v2);
@@ -2164,7 +2168,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
  
        Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
  
-      Temp cmp_res = bld.tmp(s2);
+      Temp cmp_res = bld.tmp(bld.lm);
        bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc);
  
        Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
@@ -2338,14 +2342,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_ieq: {
        if (instr->src[0].src.ssa->bit_size == 1)
-         emit_boolean_logic(ctx, instr, aco_opcode::s_xnor_b64, dst);
+         emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
        else
           emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, aco_opcode::s_cmp_eq_u64);
        break;
     }
     case nir_op_ine: {
        if (instr->src[0].src.ssa->bit_size == 1)
-         emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b64, dst);
+         emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
        else
           emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, aco_opcode::s_cmp_lg_u64);
        break;
@@ -2405,8 +2409,10 @@ void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
     Builder bld(ctx->program, ctx->block);
  
     if (instr->def.bit_size == 1) {
-      assert(dst.regClass() == s2);
-      bld.sop1(aco_opcode::s_mov_b64, Definition(dst), Operand((uint64_t)(instr->value[0].b ? -1 : 0)));
+      assert(dst.regClass() == bld.lm);
+      int val = instr->value[0].b ? -1 : 0;
+      Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
+      bld.sop1(Builder::s_mov, Definition(dst), op);
     } else if (dst.size() == 1) {
        bld.copy(Definition(dst), Operand(instr->value[0].u32));
     } else {
@@ -3033,7 +3039,7 @@ Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alph
     /* Convert back to the right type. */
     if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
        alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
-      Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha);
+      Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha);
        alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
     } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
        alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
@@ -3599,8 +3605,8 @@ void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
     // TODO: optimize uniform conditions
     Builder bld(ctx->program, ctx->block);
     Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
-   assert(src.regClass() == s2);
-   src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+   assert(src.regClass() == bld.lm);
+   src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
     bld.pseudo(aco_opcode::p_discard_if, src);
     ctx->block->kind |= block_kind_uses_discard_if;
     return;
@@ -3663,7 +3669,7 @@ void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
        ctx->program->needs_exact = true;
        /* save exec somewhere temporarily so that it doesn't get
         * overwritten before the discard from outer exec masks */
-      Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2));
+      Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm));
        bld.pseudo(aco_opcode::p_discard_if, cond);
        ctx->block->kind |= block_kind_uses_discard_if;
        return;
@@ -3950,7 +3956,7 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coo
     /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
      * resource descriptor is 0 (invalid),
      */
-   Temp compare = bld.tmp(s2);
+   Temp compare = bld.tmp(bld.lm);
     bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
                  Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
  
@@ -4739,12 +4745,12 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
        if (offset > 0 && ctx->options->chip_class < GFX9) {
           Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
           Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
-         Temp carry = bld.tmp(s2);
+         Temp carry = bld.tmp(bld.lm);
           bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
  
           bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
                    Operand(offset), addr0);
-         bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2),
+         bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
                    Operand(0u), addr1,
                    carry).def(1).setHint(vcc);
  
@@ -5219,25 +5225,25 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te
        return src;
     } if (op == nir_op_iand && cluster_size == 4) {
        //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
-      Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
-      return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc),
-                      bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp));
+      Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
+      return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
+                      bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
     } else if (op == nir_op_ior && cluster_size == 4) {
        //subgroupClusteredOr(val, 4) -> wqm(val & exec)
-      return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc),
-                      bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)));
+      return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
+                      bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
     } else if (op == nir_op_iand && cluster_size == 64) {
        //subgroupAnd(val) -> (exec & ~val) == 0
-      Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
-      return bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp));
+      Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
+      return bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp));
     } else if (op == nir_op_ior && cluster_size == 64) {
        //subgroupOr(val) -> (val & exec) != 0
-      Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
+      Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
        return bool_to_vector_condition(ctx, tmp);
     } else if (op == nir_op_ixor && cluster_size == 64) {
        //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
-      Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
-      tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s1), bld.def(s1, scc), tmp);
+      Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
+      tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
        tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
        return bool_to_vector_condition(ctx, tmp);
     } else {
@@ -5256,25 +5262,28 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te
  
        Temp tmp;
        if (op == nir_op_iand)
-         tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+         tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
        else
-         tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+         tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
  
        uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
-      tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
+      if (ctx->program->wave_size == 64)
+         tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
+      else
+         tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
        tmp = emit_extract_vector(ctx, tmp, 0, v1);
        if (cluster_mask != 0xffffffff)
           tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
  
        Definition cmp_def = Definition();
        if (op == nir_op_iand) {
-         cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0);
+         cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(cluster_mask), tmp).def(0);
        } else if (op == nir_op_ior) {
-         cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
+         cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
        } else if (op == nir_op_ixor) {
           tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
                          bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
-         cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
+         cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
        }
        cmp_def.setHint(vcc);
        return cmp_def.getTemp();
@@ -5290,9 +5299,9 @@ Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
     //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
     Temp tmp;
     if (op == nir_op_iand)
-      tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
+      tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
     else
-      tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+      tmp = bld.sop2(Builder::s_and, bld.def(s2), bld.def(s1, scc), src, Operand(exec, bld.lm));
  
     Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
     Temp lo = lohi.def(0).getTemp();
@@ -5301,11 +5310,11 @@ Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
  
     Definition cmp_def = Definition();
     if (op == nir_op_iand)
-      cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
+      cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
     else if (op == nir_op_ior)
-      cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
+      cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
     else if (op == nir_op_ixor)
-      cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u),
+      cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u),
                           bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
     cmp_def.setHint(vcc);
     return cmp_def.getTemp();
@@ -5320,11 +5329,11 @@ Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
     //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
     Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
     if (op == nir_op_iand)
-      return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
+      return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
     else if (op == nir_op_ior)
-      return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
+      return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
     else if (op == nir_op_ixor)
-      return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
+      return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
  
     assert(false);
     return Temp();
@@ -5453,7 +5462,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
           Temp pck0 = bld.tmp(v1);
           Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
           tmp1 = as_vgpr(ctx, tmp1);
-         Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry);
+         Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry);
           addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
  
           /* sample_pos = flat_load_dwordx2 addr */
@@ -5685,11 +5694,12 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        break;
     }
     case nir_intrinsic_ballot: {
-      Definition tmp = bld.def(s2);
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+      Definition tmp = bld.def(dst.regClass());
        if (instr->src[0].ssa->bit_size == 1) {
-         assert(src.regClass() == s2);
-         bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src);
+         assert(src.regClass() == bld.lm);
+         bld.sop2(Builder::s_and, tmp, bld.def(s1, scc), Operand(exec, bld.lm), src);
        } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
           bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src);
        } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
@@ -5699,7 +5709,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
           nir_print_instr(&instr->instr, stderr);
           fprintf(stderr, "\n");
        }
-      emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa));
+      emit_wqm(ctx, tmp.getTemp(), dst);
        break;
     }
     case nir_intrinsic_shuffle:
@@ -5722,15 +5732,19 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
              bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
              emit_split_vector(ctx, dst, 2);
           } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
-            assert(src.regClass() == s2);
-            Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, tid);
+            assert(src.regClass() == bld.lm);
+            Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
              bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
           } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
-            assert(src.regClass() == s2);
-            Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
+            assert(src.regClass() == bld.lm);
+            Temp tmp;
+            if (ctx->program->wave_size == 64)
+               tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
+            else
+               tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
              tmp = emit_extract_vector(ctx, tmp, 0, v1);
              tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
-            emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst);
+            emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst);
           } else {
              fprintf(stderr, "Unimplemented NIR instr bit size: ");
              nir_print_instr(&instr->instr, stderr);
@@ -5763,9 +5777,9 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
           emit_split_vector(ctx, dst, 2);
        } else if (instr->dest.ssa.bit_size == 1) {
-         assert(src.regClass() == s2);
-         Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
-                             bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)));
+         assert(src.regClass() == bld.lm);
+         Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
+                             bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
           bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
        } else if (src.regClass() == s1) {
           bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
@@ -5781,22 +5795,22 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     case nir_intrinsic_vote_all: {
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
        Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-      assert(src.regClass() == s2);
-      assert(dst.regClass() == s2);
+      assert(src.regClass() == bld.lm);
+      assert(dst.regClass() == bld.lm);
  
-      Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
-      Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp));
+      Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
+      Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp));
        emit_wqm(ctx, val, dst);
        break;
     }
     case nir_intrinsic_vote_any: {
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
        Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-      assert(src.regClass() == s2);
-      assert(dst.regClass() == s2);
+      assert(src.regClass() == bld.lm);
+      assert(dst.regClass() == bld.lm);
  
-      Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
-      Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(-1u), Operand(0u), bld.scc(tmp));
+      Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
+      Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), Operand(0u), bld.scc(tmp));
        emit_wqm(ctx, val, dst);
        break;
     }
@@ -5879,7 +5893,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
  
           Temp tmp_dst = bld.tmp(dst.regClass());
           reduce->definitions[0] = Definition(tmp_dst);
-         reduce->definitions[1] = bld.def(s2); // used internally
+         reduce->definitions[1] = bld.def(ctx->program->lane_mask); // used internally
           reduce->definitions[2] = Definition();
           reduce->definitions[3] = Definition(scc, s1);
           reduce->definitions[4] = Definition();
@@ -5899,13 +5913,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
           Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
           unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
           if (instr->dest.ssa.bit_size == 1) {
-            assert(src.regClass() == s2);
+            assert(src.regClass() == bld.lm);
+            assert(dst.regClass() == bld.lm);
              uint32_t half_mask = 0x11111111u << lane;
              Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
-            Temp tmp = bld.tmp(s2);
-            bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp),
-                     bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp,
-                              bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))));
+            Temp tmp = bld.tmp(bld.lm);
+            bld.sop1(Builder::s_wqm, Definition(tmp),
+                     bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
+                              bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))));
              emit_wqm(ctx, tmp, dst);
           } else if (instr->dest.ssa.bit_size == 32) {
              emit_wqm(ctx,
@@ -5957,10 +5972,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
  
        Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
        if (instr->dest.ssa.bit_size == 1) {
-         assert(src.regClass() == s2);
+         assert(src.regClass() == bld.lm);
           src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
           src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
-         Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src);
+         Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
           emit_wqm(ctx, tmp, dst);
        } else if (instr->dest.ssa.bit_size == 32) {
           Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
@@ -6060,15 +6075,15 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        break;
     case nir_intrinsic_demote_if: {
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
-      assert(src.regClass() == s2);
-      Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+      assert(src.regClass() == bld.lm);
+      Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
        bld.pseudo(aco_opcode::p_demote_to_helper, cond);
        ctx->block->kind |= block_kind_uses_demote;
        ctx->program->needs_exact = true;
        break;
     }
     case nir_intrinsic_first_invocation: {
-      emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)),
+      emit_wqm(ctx, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
                 get_ssa_temp(ctx, &instr->dest.ssa));
        break;
     }
@@ -6180,14 +6195,14 @@ void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
     Operand two(0x40000000u);
     Operand four(0x40800000u);
  
-   Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma);
+   Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma);
     Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
     Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
  
-   Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id);
+   Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
-   Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
-   is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), is_ma_y, is_ma_z);
-   Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y);
+   Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
+   is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
+   Temp is_not_ma_x = bld.sop2(Builder::s_or, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y);
  
     // select sc
     Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
@@ -6667,7 +6682,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
                              Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
                              bld.scc(compare_cube_wa));
           }
-         tg4_compare_cube_wa64 = bld.tmp(s2);
+         tg4_compare_cube_wa64 = bld.tmp(bld.lm);
           bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
  
           nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
@@ -6800,7 +6815,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
           assert(dmask == 1 && dst.regClass() == v1);
           assert(dst.id() != tmp_dst.id());
  
-         Temp tmp = bld.tmp(s2);
+         Temp tmp = bld.tmp(bld.lm);
           bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
           bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
  
@@ -6921,7 +6936,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr)
  {
     aco_ptr<Pseudo_instruction> phi;
     Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-   assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == s2);
+   assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
  
     bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index];
     logical |= ctx->block->kind & block_kind_merge;
@@ -7295,7 +7310,7 @@ static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond
     ctx->block->kind |= block_kind_branch;
  
     /* branch to linear then block */
-   assert(cond.regClass() == s2);
+   assert(cond.regClass() == ctx->program->lane_mask);
     aco_ptr<Pseudo_branch_instruction> branch;
     branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
     branch->operands[0] = Operand(cond);
@@ -7439,7 +7454,7 @@ static void visit_if(isel_context *ctx, nir_if *if_stmt)
        ctx->block->kind |= block_kind_uniform;
  
        /* emit branch */
-      assert(cond.regClass() == s2);
+      assert(cond.regClass() == bld.lm);
        // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
        cond = bool_to_scalar_condition(ctx, cond);
  
@@ -7825,7 +7840,7 @@ void handle_bc_optimize(isel_context *ctx)
     ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
     ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
     if (uses_center && uses_centroid) {
-      Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)),
+      Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
                                get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u));
  
        if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
@@ -7934,7 +7949,7 @@ void select_program(Program *program,
           Builder bld(ctx.program, ctx.block);
           Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
           Temp thread_id = emit_mbcnt(&ctx, bld.def(v1));
-         Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id);
+         Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id);
  
           begin_divergent_if_then(&ctx, &ic, cond);
        }
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp

index ab96a4507cfa94ddb82ffd0981bf23a1772d6385..a7446c6c058ad2dc68cb0e815cbeeb0856c58693 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -126,6 +126,7 @@ unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp)
  void init_context(isel_context *ctx, nir_shader *shader)
  {
     nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+   unsigned lane_mask_size = ctx->program->lane_mask.size();
  
     ctx->shader = shader;
     ctx->divergent_vals = nir_divergence_analysis(shader, nir_divergence_view_index_uniform);
@@ -207,7 +208,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
                    case nir_op_ieq:
                    case nir_op_ine:
                    case nir_op_i2b1:
-                     size = 2;
+                     size = lane_mask_size;
                       break;
                    case nir_op_f2i64:
                    case nir_op_f2u64:
@@ -219,7 +220,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
                       break;
                    case nir_op_bcsel:
                       if (alu_instr->dest.dest.ssa.bit_size == 1) {
-                        size = 2;
+                        size = lane_mask_size;
                       } else {
                          if (ctx->divergent_vals[alu_instr->dest.dest.ssa.index]) {
                             type = RegType::vgpr;
@@ -237,14 +238,14 @@ void init_context(isel_context *ctx, nir_shader *shader)
                       break;
                    case nir_op_mov:
                       if (alu_instr->dest.dest.ssa.bit_size == 1) {
-                        size = 2;
+                        size = lane_mask_size;
                       } else {
                          type = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr;
                       }
                       break;
                    default:
                       if (alu_instr->dest.dest.ssa.bit_size == 1) {
-                        size = 2;
+                        size = lane_mask_size;
                       } else {
                          for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
                             if (allocated[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr)
@@ -261,7 +262,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
                 if (nir_instr_as_load_const(instr)->def.bit_size == 64)
                    size *= 2;
                 else if (nir_instr_as_load_const(instr)->def.bit_size == 1)
-                  size *= 2;
+                  size *= lane_mask_size;
                 allocated[nir_instr_as_load_const(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size));
                 break;
              }
@@ -289,11 +290,11 @@ void init_context(isel_context *ctx, nir_shader *shader)
                    case nir_intrinsic_first_invocation:
                       type = RegType::sgpr;
                       if (intrinsic->dest.ssa.bit_size == 1)
-                        size = 2;
+                        size = lane_mask_size;
                       break;
                    case nir_intrinsic_ballot:
                       type = RegType::sgpr;
-                     size = 2;
+                     size = lane_mask_size;
                       break;
                    case nir_intrinsic_load_sample_id:
                    case nir_intrinsic_load_sample_mask_in:
@@ -369,7 +370,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
                    case nir_intrinsic_inclusive_scan:
                    case nir_intrinsic_exclusive_scan:
                       if (intrinsic->dest.ssa.bit_size == 1) {
-                        size = 2;
+                        size = lane_mask_size;
                          type = RegType::sgpr;
                       } else if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) {
                          type = RegType::sgpr;
@@ -384,11 +385,11 @@ void init_context(isel_context *ctx, nir_shader *shader)
                    case nir_intrinsic_load_helper_invocation:
                    case nir_intrinsic_is_helper_invocation:
                       type = RegType::sgpr;
-                     size = 2;
+                     size = lane_mask_size;
                       break;
                    case nir_intrinsic_reduce:
                       if (intrinsic->dest.ssa.bit_size == 1) {
-                        size = 2;
+                        size = lane_mask_size;
                          type = RegType::sgpr;
                       } else if (nir_intrinsic_cluster_size(intrinsic) == 0 ||
                           !ctx->divergent_vals[intrinsic->dest.ssa.index]) {
@@ -489,7 +490,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
                 if (phi->dest.ssa.bit_size == 1) {
                    assert(size == 1 && "multiple components not yet supported on boolean phis.");
                    type = RegType::sgpr;
-                  size *= 2;
+                  size *= lane_mask_size;
                    allocated[phi->dest.ssa.index] = Temp(0, RegClass(type, size));
                    break;
                 }
@@ -590,7 +591,7 @@ Pseudo_instruction *add_startpgm(struct isel_context *ctx)
        startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
        arg++;
     }
-   startpgm->definitions[arg_count] = Definition{ctx->program->allocateId(), exec, s2};
+   startpgm->definitions[arg_count] = Definition{ctx->program->allocateId(), exec, ctx->program->lane_mask};
     Pseudo_instruction *instr = startpgm.get();
     ctx->block->instructions.push_back(std::move(startpgm));
  
@@ -796,6 +797,7 @@ setup_isel_context(Program* program,
     program->chip_class = args->options->chip_class;
     program->family = args->options->family;
     program->wave_size = args->shader_info->wave_size;
+   program->lane_mask = program->wave_size == 32 ? s1 : s2;
  
     program->lds_alloc_granule = args->options->chip_class >= GFX7 ? 512 : 256;
     program->lds_limit = args->options->chip_class >= GFX7 ? 65536 : 32768;
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h

index 4073086662ac5dd85cdb65587026a26aaa347ee0..1f4721f5ffdf31ca236644bdca39b68595054c8d 100644 (file)
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1149,6 +1149,7 @@ public:
     enum chip_class chip_class;
     enum radeon_family family;
     unsigned wave_size;
+   RegClass lane_mask;
     Stage stage; /* Stage */
     bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
     bool needs_wqm = false; /* there exists a p_wqm instruction */
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp

index 05ddb7bc68a69d3942a62c62d3515ccc86762d86..4255d56173be7e4ceb9f6efc40e967771066ac38 100644 (file)
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -54,7 +54,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
     bool exec_live = false;
     if (block->live_out_exec != Temp()) {
        live_sgprs.insert(block->live_out_exec);
-      new_demand.sgpr += 2;
+      new_demand.sgpr += program->lane_mask.size();
        exec_live = true;
     }
  
@@ -77,10 +77,10 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
        if (is_phi(insn))
           break;
  
-      /* substract the 2 sgprs from exec */
+      /* subtract the 1 or 2 sgprs from exec */
        if (exec_live)
-         assert(new_demand.sgpr >= 2);
-      register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? 2 : 0));
+         assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
+      register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? program->lane_mask.size() : 0));
  
        /* KILL */
        for (Definition& definition : insn->definitions) {
@@ -144,8 +144,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
  
     /* update block's register demand for a last time */
     if (exec_live)
-      assert(new_demand.sgpr >= 2);
-   new_demand.sgpr -= exec_live ? 2 : 0;
+      assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
+   new_demand.sgpr -= exec_live ? program->lane_mask.size() : 0;
     block->register_demand.update(new_demand);
  
     /* handle phi definitions */
diff --git a/src/amd/compiler/aco_lower_bool_phis.cpp b/src/amd/compiler/aco_lower_bool_phis.cpp

index dc64f0133b535fb7e7464a9e3d72a5d3339562f3..988f753c82d14d49e71fe3db2fd0fb63eae28754 100644 (file)
--- a/src/amd/compiler/aco_lower_bool_phis.cpp
+++ b/src/amd/compiler/aco_lower_bool_phis.cpp
@@ -54,12 +54,12 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state)
     while (true) {
        auto pos = state->latest.find(block_idx);
        if (pos != state->latest.end())
-         return Operand({pos->second, s2});
+         return Operand({pos->second, program->lane_mask});
  
        Block& block = program->blocks[block_idx];
        size_t pred = block.linear_preds.size();
        if (pred == 0) {
-         return Operand(s2);
+         return Operand(program->lane_mask);
        } else if (pred == 1) {
           block_idx = block.linear_preds[0];
           continue;
@@ -75,10 +75,10 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state)
                 state->phis[phi->operands[i].tempId()][(phi_use){&block, res}] |= (uint64_t)1 << i;
              }
           }
-         phi->definitions[0] = Definition(Temp{res, s2});
+         phi->definitions[0] = Definition(Temp{res, program->lane_mask});
           block.instructions.emplace(block.instructions.begin(), std::move(phi));
  
-         return Operand({res, s2});
+         return Operand({res, program->lane_mask});
        }
     }
  }
@@ -118,7 +118,7 @@ Temp write_ssa(Program *program, Block *block, ssa_state *state, unsigned previo
           update_phi(program, state, phi.first.block, phi.first.phi_def, phi.second);
     }
  
-   return {id, s2};
+   return {id, program->lane_mask};
  }
  
  void insert_before_logical_end(Block *block, aco_ptr<Instruction> instr)
@@ -150,23 +150,25 @@ void lower_divergent_bool_phi(Program *program, Block *block, aco_ptr<Instructio
  
        assert(phi->operands[i].isTemp());
        Temp phi_src = phi->operands[i].getTemp();
-      assert(phi_src.regClass() == s2);
+      assert(phi_src.regClass() == bld.lm);
  
        Operand cur = get_ssa(program, pred->index, &state);
+      assert(cur.regClass() == bld.lm);
        Temp new_cur = write_ssa(program, pred, &state, cur.isTemp() ? cur.tempId() : 0);
+      assert(new_cur.regClass() == bld.lm);
  
        if (cur.isUndefined()) {
           insert_before_logical_end(pred, bld.sop1(aco_opcode::s_mov_b64, Definition(new_cur), phi_src).get_ptr());
        } else {
-         Temp tmp1 = bld.tmp(s2), tmp2 = bld.tmp(s2);
+         Temp tmp1 = bld.tmp(bld.lm), tmp2 = bld.tmp(bld.lm);
           insert_before_logical_end(pred,
-            bld.sop2(aco_opcode::s_andn2_b64, Definition(tmp1), bld.def(s1, scc),
-                     cur, Operand(exec, s2)).get_ptr());
+            bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc),
+                     cur, Operand(exec, bld.lm)).get_ptr());
           insert_before_logical_end(pred,
-            bld.sop2(aco_opcode::s_and_b64, Definition(tmp2), bld.def(s1, scc),
-                     phi_src, Operand(exec, s2)).get_ptr());
+            bld.sop2(Builder::s_and, Definition(tmp2), bld.def(s1, scc),
+                     phi_src, Operand(exec, bld.lm)).get_ptr());
           insert_before_logical_end(pred,
-            bld.sop2(aco_opcode::s_or_b64, Definition(new_cur), bld.def(s1, scc),
+            bld.sop2(Builder::s_or, Definition(new_cur), bld.def(s1, scc),
                       tmp1, tmp2).get_ptr());
        }
     }
@@ -192,8 +194,8 @@ void lower_bool_phis(Program* program)
     for (Block& block : program->blocks) {
        for (aco_ptr<Instruction>& phi : block.instructions) {
           if (phi->opcode == aco_opcode::p_phi) {
-            assert(phi->definitions[0].regClass() != s1);
-            if (phi->definitions[0].regClass() == s2)
+            assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 : phi->definitions[0].regClass() != s2);
+            if (phi->definitions[0].regClass() == program->lane_mask)
                 lower_divergent_bool_phi(program, &block, phi);
           } else if (!is_phi(phi)) {
              break;
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp

index cbb3b55179c3f850caae7eece1cdec40574f890a..e9c2d66d8233818922f820610563f09b7ff1b5ff 100644 (file)
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -76,8 +76,10 @@ aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) {
  void emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
  {
     Instruction *instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
-   if (instr->definitions.size() >= 2)
+   if (instr->definitions.size() >= 2) {
+      assert(instr->definitions[1].regClass() == bld.lm);
        instr->definitions[1].setFixed(vcc);
+   }
  }
  
  void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
@@ -99,12 +101,12 @@ void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, Ph
              bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
           bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
                        dpp_ctrl, row_mask, bank_mask, bound_ctrl);
-         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(s2, vcc), vtmp_op[0], src1[0]);
+         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
        } else {
-         bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(s2, vcc), src0[0], src1[0],
+         bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
                        dpp_ctrl, row_mask, bank_mask, bound_ctrl);
        }
-      bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(s2, vcc), src0[1], src1[1], Operand(vcc, s2),
+      bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm),
                     dpp_ctrl, row_mask, bank_mask, bound_ctrl);
     } else if (op == iand64) {
        bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0],
@@ -149,9 +151,9 @@ void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, Ph
        bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1],
                     dpp_ctrl, row_mask, bank_mask, bound_ctrl);
  
-      bld.vopc(cmp, bld.def(s2, vcc), vtmp_op64, src1_64);
-      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, s2));
-      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, s2));
+      bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
+      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
+      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
     } else if (op == imul64) {
        /* t4 = dpp(x_hi)
         * t1 = umul_lo(t4, y_lo)
@@ -216,11 +218,11 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe
  
     if (op == iadd64) {
        if (ctx->program->chip_class >= GFX10) {
-         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(s2, vcc), src0[0], src1[0]);
+         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
        } else {
-         bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(s2, vcc), src0[0], src1[0]);
+         bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
        }
-      bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(s2, vcc), src0[1], src1[1], Operand(vcc, s2));
+      bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm));
     } else if (op == iand64) {
        bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
        bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
@@ -249,9 +251,9 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe
           break;
        }
  
-      bld.vopc(cmp, bld.def(s2, vcc), src0_64, src1_64);
-      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, s2));
-      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, s2));
+      bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
+      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
+      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
     } else if (op == imul64) {
        if (src1_reg == dst_reg) {
           /* it's fine if src0==dst but not if src1==dst */
@@ -298,7 +300,7 @@ void emit_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg
  
     if (!vop3) {
        if (opcode == aco_opcode::v_add_co_u32)
-         bld.vop2_dpp(opcode, dst, bld.def(s2, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
+         bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
        else
           bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
        return;
@@ -342,7 +344,7 @@ void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1
     if (vop3) {
        bld.vop3(opcode, dst, src0, src1);
     } else if (opcode == aco_opcode::v_add_co_u32) {
-      bld.vop2(opcode, dst, bld.def(s2, vcc), src0, src1);
+      bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
     } else {
        bld.vop2(opcode, dst, src0, src1);
     }
@@ -420,7 +422,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
     Operand vcndmask_identity[2] = {identity[0], identity[1]};
  
     /* First, copy the source to tmp and set inactive lanes to the identity */
-   bld.sop1(aco_opcode::s_or_saveexec_b64, Definition(stmp, s2), Definition(scc, s1), Definition(exec, s2), Operand(UINT64_MAX), Operand(exec, s2));
+   bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1), Definition(exec, bld.lm), Operand(UINT64_MAX), Operand(exec, bld.lm));
  
     for (unsigned i = 0; i < src.size(); i++) {
        /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32
@@ -440,7 +442,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
     for (unsigned i = 0; i < src.size(); i++) {
        bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
                     vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
-                   Operand(stmp, s2));
+                   Operand(stmp, bld.lm));
     }
  
     bool exec_restored = false;
@@ -463,7 +465,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
        if (cluster_size == 32) {
           for (unsigned i = 0; i < src.size(); i++)
              bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, s1), ds_pattern_bitmode(0x1f, 0, 0x10));
-         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2));
+         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
           exec_restored = true;
           emit_op(ctx, dst.physReg(), vtmp, tmp, PhysReg{0}, reduce_op, src.size());
           dst_written = true;
@@ -500,7 +502,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
                                           Operand(0xffffffffu), Operand(0xffffffffu)).instr;
              static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
           }
-         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
+         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(UINT64_MAX));
  
           /* fill in the gap in row 2 */
           for (unsigned i = 0; i < src.size(); i++) {
@@ -559,7 +561,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
     }
  
     if (!exec_restored)
-      bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2));
+      bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
  
     if (op == aco_opcode::p_reduce && cluster_size == 64) {
        for (unsigned k = 0; k < src.size(); k++) {
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp

index 28a779580a26f7f52cea75e037861973730b2e6a..68a0dc1576129e5e7c0e52252bf547745eec0f2b 100644 (file)
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -172,7 +172,7 @@ void setup_reduce_temp(Program* program)
              clobber_vcc = true;
  
           if (clobber_vcc)
-            instr->definitions[4] = Definition(vcc, s2);
+            instr->definitions[4] = Definition(vcc, bld.lm);
        }
     }
  }
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp

index c4144cc42f0cf77d6e5fa8acf85f30e0826fdec3..504ad0157463826b800a7b10573134bfc0f7727f 100644 (file)
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -1719,6 +1719,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
  
                 pc->operands[i] = parallelcopy[i].first;
                 pc->definitions[i] = parallelcopy[i].second;
+               assert(pc->operands[i].size() == pc->definitions[i].size());
  
                 /* it might happen that the operand is already renamed. we have to restore the original name. */
                 std::map<unsigned, Temp>::iterator it = ctx.orig_names.find(pc->operands[i].tempId());
diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp

index 3d76dcd88671b313c9bbbbee332a6be6e8c85a09..54e691ba476dbf9477e0c14891f5ada84469155a 100644 (file)
--- a/src/amd/compiler/aco_ssa_elimination.cpp
+++ b/src/amd/compiler/aco_ssa_elimination.cpp
@@ -58,6 +58,7 @@ void collect_phi_info(ssa_elimination_ctx& ctx)
              std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
              phi_info& info = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info : ctx.linear_phi_info;
              const auto result = info.emplace(preds[i], std::vector<std::pair<Definition, Operand>>());
+            assert(phi->definitions[0].size() == phi->operands[i].size());
              result.first->second.emplace_back(phi->definitions[0], phi->operands[i]);
              ctx.empty_blocks[preds[i]] = false;
           }
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp

index 8d2bf8449db1a6e732d6b64f187ee50a6a537947..8282d7e27e371f814bdd4b017ad522b9d7823308 100644 (file)
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -190,7 +190,7 @@ void validate(Program* program, FILE * output)
                 }
              } else if (instr->opcode == aco_opcode::p_phi) {
                 check(instr->operands.size() == block.logical_preds.size(), "Number of Operands does not match number of predecessors", instr.get());
-               check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->definitions[0].getTemp().regClass() == s2, "Logical Phi Definition must be vgpr or divergent boolean", instr.get());
+               check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->definitions[0].getTemp().regClass() == program->lane_mask, "Logical Phi Definition must be vgpr or divergent boolean", instr.get());
              } else if (instr->opcode == aco_opcode::p_linear_phi) {
                 for (const Operand& op : instr->operands)
                    check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", instr.get());
author	Timur Kristóf <timur.kristof@gmail.com>
	Wed, 27 Nov 2019 10:04:47 +0000 (11:04 +0100)
committer	Daniel Schürmann <daniel@schuermann.dev>
	Wed, 4 Dec 2019 10:36:01 +0000 (10:36 +0000)
src/amd/compiler/aco_builder_h.py		patch \| blob \| history
src/amd/compiler/aco_insert_exec_mask.cpp		patch \| blob \| history
src/amd/compiler/aco_instruction_selection.cpp		patch \| blob \| history
src/amd/compiler/aco_instruction_selection_setup.cpp		patch \| blob \| history
src/amd/compiler/aco_ir.h		patch \| blob \| history
src/amd/compiler/aco_live_var_analysis.cpp		patch \| blob \| history
src/amd/compiler/aco_lower_bool_phis.cpp		patch \| blob \| history
src/amd/compiler/aco_lower_to_hw_instr.cpp		patch \| blob \| history
src/amd/compiler/aco_reduce_assign.cpp		patch \| blob \| history
src/amd/compiler/aco_register_allocation.cpp		patch \| blob \| history
src/amd/compiler/aco_ssa_elimination.cpp		patch \| blob \| history
src/amd/compiler/aco_validate.cpp		patch \| blob \| history