From 6a586a60067ccc7337a3bb047e21ecc2384cc56a Mon Sep 17 00:00:00 2001
From: Daniel Schürmann
Date: Thu, 7 Nov 2019 18:02:33 +0100
Subject: [PATCH] aco: split read/writelane opcode into VOP2/VOP3 version for
 SI/CI

Reviewed-by: Rhys Perry
---
 src/amd/compiler/aco_builder_h.py             | 13 +++++++
 src/amd/compiler/aco_insert_NOPs.cpp          |  8 +++--
 src/amd/compiler/aco_insert_exec_mask.cpp     |  4 ++-
 .../compiler/aco_instruction_selection.cpp    |  8 ++---
 src/amd/compiler/aco_lower_to_hw_instr.cpp    | 34 +++++++++----------
 src/amd/compiler/aco_opcodes.py               |  6 ++--
 src/amd/compiler/aco_optimizer.cpp            | 16 ++++++---
 src/amd/compiler/aco_register_allocation.cpp  |  3 +-
 src/amd/compiler/aco_validate.cpp             | 15 ++++++--
 9 files changed, 72 insertions(+), 35 deletions(-)

diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index ada0806f6a9..d215c7b198f 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -418,6 +418,19 @@ public:
       return insert(std::move(sub));
    }
 
+   Result readlane(Definition dst, Op vsrc, Op lane)
+   {
+      if (program->chip_class >= GFX8)
+         return vop3(aco_opcode::v_readlane_b32_e64, dst, vsrc, lane);
+      else
+         return vop2(aco_opcode::v_readlane_b32, dst, vsrc, lane);
+   }
+   Result writelane(Definition dst, Op val, Op lane, Op vsrc) {
+      if (program->chip_class >= GFX8)
+         return vop3(aco_opcode::v_writelane_b32_e64, dst, val, lane, vsrc);
+      else
+         return vop2(aco_opcode::v_writelane_b32, dst, val, lane, vsrc);
+   }
 <%
 import itertools
 formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]),
diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
index 29bc8375ffc..1ead0c04da5 100644
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@@ -110,7 +110,9 @@ bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
       return true;
    if (instr->isVOP3() && instr->definitions.size() == 2)
       return true;
-   if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32)
+   if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
+       instr->opcode == aco_opcode::v_readlane_b32 ||
+       instr->opcode == aco_opcode::v_readlane_b32_e64)
       return true;
    return false;
 }
@@ -285,7 +287,9 @@ int handle_instruction_gfx8_9(NOP_ctx_gfx8_9& ctx, aco_ptr<Instruction>& instr,
 
    switch (instr->opcode) {
       case aco_opcode::v_readlane_b32:
-      case aco_opcode::v_writelane_b32: {
+      case aco_opcode::v_readlane_b32_e64:
+      case aco_opcode::v_writelane_b32:
+      case aco_opcode::v_writelane_b32_e64: {
          if (ctx.VALU_wrsgpr + 4 < new_idx)
             break;
          PhysReg reg = instr->operands[1].physReg();
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index cbc0698096b..607b4f52793 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -118,7 +118,9 @@ bool pred_by_exec_mask(aco_ptr<Instruction>& instr) {
    }
 
    if (instr->opcode == aco_opcode::v_readlane_b32 ||
-       instr->opcode == aco_opcode::v_writelane_b32)
+       instr->opcode == aco_opcode::v_readlane_b32_e64 ||
+       instr->opcode == aco_opcode::v_writelane_b32 ||
+       instr->opcode == aco_opcode::v_writelane_b32_e64)
       return false;
 
    return true;
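
Note: the core of the change is the chip-class dispatch added to the Builder above. The standalone sketch below uses simplified stand-in enums (not ACO's actual Builder, opcode table or chip_class type) to illustrate the rule the helpers encode: SI/CI (GFX6/GFX7) only have the VOP2 encoding of v_readlane_b32/v_writelane_b32, while GFX8 and later only expose the VOP3 "_e64" encoding.

// Standalone illustration of the encoding choice made by the new Builder helpers.
// The enum values and helper names are illustrative stand-ins, not ACO's API.
#include <cstdio>

enum chip_class { GFX6, GFX7, GFX8, GFX9, GFX10 };
enum class opcode { v_readlane_b32, v_readlane_b32_e64,
                    v_writelane_b32, v_writelane_b32_e64 };

// SI/CI (GFX6/GFX7) only provide the VOP2 encoding of readlane/writelane;
// GFX8+ dropped it in favour of the VOP3 ("_e64") encoding.
opcode pick_readlane(chip_class cc)
{
   return cc >= GFX8 ? opcode::v_readlane_b32_e64 : opcode::v_readlane_b32;
}
opcode pick_writelane(chip_class cc)
{
   return cc >= GFX8 ? opcode::v_writelane_b32_e64 : opcode::v_writelane_b32;
}

int main()
{
   std::printf("GFX7 readlane uses VOP2: %d\n", pick_readlane(GFX7) == opcode::v_readlane_b32);
   std::printf("GFX9 readlane uses VOP3: %d\n", pick_readlane(GFX9) == opcode::v_readlane_b32_e64);
}
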
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index f05c1df9d03..1222a11f2b4 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -163,7 +163,7 @@ Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_ne
 
 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
 {
    if (index.regClass() == s1)
-      return bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), data, index);
+      return bld.readlane(bld.def(s1), data, index);
 
    Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
@@ -6098,14 +6098,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
       if (dst.regClass() == v1) {
          /* src2 is ignored for writelane. RA assigns the same reg for dst */
-         emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
+         emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst);
       } else if (dst.regClass() == v2) {
          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
-         Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_hi));
-         Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
+         Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_hi));
+         Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
          emit_split_vector(ctx, dst, 2);
       } else {
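
Note: because one opcode becomes two, every pass that special-cases lane instructions (the NOP-insertion, exec-mask and validation changes in this patch) now has to match both encodings. A minimal sketch of that pattern, using a stand-in opcode enum rather than ACO's generated opcode list:

// Sketch: predicate that treats the VOP2 and VOP3 forms identically, the way
// aco_insert_exec_mask.cpp and aco_insert_NOPs.cpp must after this split.
// Lane instructions read/write SGPRs and are not predicated by EXEC.
#include <cstdio>

enum class opcode { v_readlane_b32, v_readlane_b32_e64,
                    v_writelane_b32, v_writelane_b32_e64, v_add_f32 };

bool is_lane_instruction(opcode op)
{
   switch (op) {
   case opcode::v_readlane_b32:
   case opcode::v_readlane_b32_e64:
   case opcode::v_writelane_b32:
   case opcode::v_writelane_b32_e64:
      return true;
   default:
      return false;
   }
}

int main()
{
   std::printf("%d %d\n", is_lane_instruction(opcode::v_writelane_b32_e64),
                          is_lane_instruction(opcode::v_add_f32));
}
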
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 19e0f598074..38e6030687b 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -481,7 +481,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
 
          if (cluster_size == 64) {
             for (unsigned i = 0; i < src.size(); i++)
-               bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+               bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
             emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
          }
       } else if (cluster_size == 32) {
@@ -519,8 +519,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
       if (ctx->program->wave_size == 64) {
          /* fill in the gap in row 2 */
          for (unsigned i = 0; i < src.size(); i++) {
-            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
-            bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
+            bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+            bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1));
          }
       }
       std::swap(tmp, vtmp);
@@ -531,8 +531,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
       if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take case of this overwise */
          if (ctx->program->chip_class < GFX10)
             assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
-         bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
-                  identity[i], Operand(0u));
+         bld.writelane(Definition(PhysReg{tmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{tmp+i}, v1));
       }
    }
    /* fall through */
@@ -562,7 +561,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
          for (unsigned i = 0; i < src.size(); i++)
-            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+            bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
          emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
       }
    } else {
@@ -581,8 +580,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
 
    if (op == aco_opcode::p_reduce && dst.regClass().type() == RegType::sgpr) {
       for (unsigned k = 0; k < src.size(); k++) {
-         bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{dst.physReg() + k}, s1),
-                  Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
+         bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1),
+                      Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
       }
    } else if (!(dst.physReg() == tmp) && !dst_written) {
       for (unsigned k = 0; k < src.size(); k++) {
@@ -911,21 +910,20 @@ void lower_to_hw_instr(Program* program)
 
          case aco_opcode::p_spill: {
             assert(instr->operands[0].regClass() == v1.as_linear());
-            for (unsigned i = 0; i < instr->operands[2].size(); i++) {
-               bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1, instr->operands[0].physReg()),
-                        Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
-                        Operand(instr->operands[1].constantValue() + i));
-            }
+            for (unsigned i = 0; i < instr->operands[2].size(); i++)
+               bld.writelane(bld.def(v1, instr->operands[0].physReg()),
+                             Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
+                             Operand(instr->operands[1].constantValue() + i),
+                             instr->operands[0]);
            break;
          }
 
         case aco_opcode::p_reload: {
            assert(instr->operands[0].regClass() == v1.as_linear());
-           for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
-              bld.vop3(aco_opcode::v_readlane_b32,
-                       bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
-                       instr->operands[0], Operand(instr->operands[1].constantValue() + i));
-           }
+           for (unsigned i = 0; i < instr->definitions[0].size(); i++)
+              bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
+                           instr->operands[0],
+                           Operand(instr->operands[1].constantValue() + i));
            break;
         }
         case aco_opcode::p_as_uniform:
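
Note: the p_spill/p_reload lowering above parks SGPR values in individual lanes of a VGPR. The toy model below (plain C++, one dword per lane, not ACO code) shows that round trip and why the old VGPR value has to stay available: v_writelane_b32 replaces a single lane and leaves the rest intact, which is why the builder's writelane helper is now passed the destination VGPR as an extra source operand.

// CPU-side model of the p_spill/p_reload lowering: an SGPR value is parked in
// one lane of a VGPR with v_writelane_b32 and fetched back with v_readlane_b32.
// The Vgpr struct is a toy model, not an ACO type.
#include <array>
#include <cinttypes>
#include <cstdint>
#include <cstdio>

struct Vgpr {
   std::array<uint32_t, 64> lane{};   // wave64: one dword per lane
};

// v_writelane_b32 overwrites a single lane and leaves all other lanes intact,
// so the register allocator must keep dst and the source VGPR in the same register.
void writelane(Vgpr &dst, uint32_t sgpr_val, unsigned lane) { dst.lane[lane] = sgpr_val; }
uint32_t readlane(const Vgpr &src, unsigned lane) { return src.lane[lane]; }

int main()
{
   Vgpr spill_slot;
   uint32_t sgpr[2] = {0xdeadbeefu, 0x12345678u};

   // p_spill: write each dword of the SGPR pair to consecutive lanes
   for (unsigned i = 0; i < 2; i++)
      writelane(spill_slot, sgpr[i], i);

   // p_reload: read the dwords back into SGPRs
   uint32_t reloaded[2];
   for (unsigned i = 0; i < 2; i++)
      reloaded[i] = readlane(spill_slot, i);

   std::printf("%08" PRIx32 " %08" PRIx32 "\n", reloaded[0], reloaded[1]);
}
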
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index f9697420ae0..65e739b0644 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -592,6 +592,8 @@ for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM:
 VOP2 = {
   # GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers
    (0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32", False),
+   (0x01, 0x01,   -1,   -1,   -1, "v_readlane_b32", False),
+   (0x02, 0x02,   -1,   -1,   -1, "v_writelane_b32", False),
    (0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True),
    (0x04, 0x04, 0x02, 0x02, 0x04, "v_sub_f32", True),
    (0x05, 0x05, 0x03, 0x03, 0x05, "v_subrev_f32", True),
@@ -984,8 +986,8 @@ VOP3 = {
    (   -1,    -1, 0x276, 0x276,    -1, "v_interp_p2_legacy_f16", True, True),
    (   -1,    -1,    -1, 0x277, 0x35a, "v_interp_p2_f16", True, True),
    (0x12b, 0x12b, 0x288, 0x288, 0x362, "v_ldexp_f32", False, True),
-   (0x101, 0x101, 0x289, 0x289, 0x360, "v_readlane_b32", False, False),
-   (0x102, 0x102, 0x28a, 0x28a, 0x361, "v_writelane_b32", False, False),
+   (   -1,    -1, 0x289, 0x289, 0x360, "v_readlane_b32_e64", False, False),
+   (   -1,    -1, 0x28a, 0x28a, 0x361, "v_writelane_b32_e64", False, False),
    (0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False),
    (0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False),
    (0x124, 0x124, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32", False, False),
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index e92531030c8..cea466e3819 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -421,14 +421,19 @@ bool can_use_VOP3(aco_ptr<Instruction>& instr)
    return instr->opcode != aco_opcode::v_madmk_f32 &&
           instr->opcode != aco_opcode::v_madak_f32 &&
           instr->opcode != aco_opcode::v_madmk_f16 &&
-          instr->opcode != aco_opcode::v_madak_f16;
+          instr->opcode != aco_opcode::v_madak_f16 &&
+          instr->opcode != aco_opcode::v_readlane_b32 &&
+          instr->opcode != aco_opcode::v_writelane_b32 &&
+          instr->opcode != aco_opcode::v_readfirstlane_b32;
 }
 
 bool can_apply_sgprs(aco_ptr<Instruction>& instr)
 {
    return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
           instr->opcode != aco_opcode::v_readlane_b32 &&
-          instr->opcode != aco_opcode::v_writelane_b32;
+          instr->opcode != aco_opcode::v_readlane_b32_e64 &&
+          instr->opcode != aco_opcode::v_writelane_b32 &&
+          instr->opcode != aco_opcode::v_writelane_b32_e64;
 }
 
 void to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)
@@ -458,6 +463,7 @@ bool can_accept_constant(aco_ptr<Instruction>& instr, unsigned operand)
    case aco_opcode::v_interp_p2_f32:
    case aco_opcode::v_mac_f32:
    case aco_opcode::v_writelane_b32:
+   case aco_opcode::v_writelane_b32_e64:
    case aco_opcode::v_cndmask_b32:
       return operand != 2;
    case aco_opcode::s_addk_i32:
@@ -466,6 +472,7 @@ bool can_accept_constant(aco_ptr<Instruction>& instr, unsigned operand)
    case aco_opcode::p_extract_vector:
    case aco_opcode::p_split_vector:
    case aco_opcode::v_readlane_b32:
+   case aco_opcode::v_readlane_b32_e64:
    case aco_opcode::v_readfirstlane_b32:
       return operand != 0;
    default:
@@ -494,7 +501,8 @@ bool valu_can_accept_literal(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned
 
 bool valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
 {
-   if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32)
+   if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64 ||
+       instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64)
       return operand != 1;
    return true;
 }
@@ -633,7 +641,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
       }
       if (info.is_constant() && can_accept_constant(instr, i)) {
          perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get());
-         if (i == 0) {
+         if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) {
             instr->operands[i] = Operand(info.val);
             continue;
          } else if (!instr->isVOP3() && can_swap_operands(instr)) {
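
Note: the register-allocation hunk below fixes the definition of v_writelane_b32_e64 to the register of operand 2, the same tied-operand rule already applied to v_mac_f32. A small standalone illustration of that constraint, with simplified structs rather than ACO's Instruction/Definition classes:

// Sketch of the tied-operand rule enforced in aco_register_allocation.cpp:
// v_writelane_b32(_e64) only modifies one lane, so its VGPR definition must
// land in the same register as the VGPR source (operand 2 in ACO's form).
#include <cassert>
#include <vector>

struct PhysReg { unsigned reg; };

struct Instr {
   const char *opcode;
   std::vector<PhysReg> operands;     // val, lane, tied VGPR
   std::vector<PhysReg> definitions;
};

void fix_tied_definition(Instr &instr)
{
   // mirrors: instr->definitions[0].setFixed(instr->operands[2].physReg());
   instr.definitions[0] = instr.operands[2];
}

int main()
{
   Instr wl{"v_writelane_b32_e64", {{/*val*/101}, {/*lane*/5}, {/*vgpr in*/260}}, {{0}}};
   fix_tied_definition(wl);
   assert(wl.definitions[0].reg == 260);   // the definition reuses the source VGPR
}
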
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index 504ad015746..834f06d2b94 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -1509,7 +1509,8 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
          /* handle definitions which must have the same register as an operand */
          if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
              instr->opcode == aco_opcode::v_mac_f32 ||
-             instr->opcode == aco_opcode::v_writelane_b32) {
+             instr->opcode == aco_opcode::v_writelane_b32 ||
+             instr->opcode == aco_opcode::v_writelane_b32_e64) {
             instr->definitions[0].setFixed(instr->operands[2].physReg());
          }
          else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32) {
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index 8282d7e27e3..655fb0c3bce 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -132,12 +132,21 @@ void validate(Program* program, FILE * output)
             check(instr->definitions[0].getTemp().type() == RegType::vgpr ||
                   (int) instr->format & (int) Format::VOPC ||
                   instr->opcode == aco_opcode::v_readfirstlane_b32 ||
-                  instr->opcode == aco_opcode::v_readlane_b32,
+                  instr->opcode == aco_opcode::v_readlane_b32 ||
+                  instr->opcode == aco_opcode::v_readlane_b32_e64,
                   "Wrong Definition type for VALU instruction", instr.get());
             unsigned num_sgpr = 0;
             unsigned sgpr_idx = instr->operands.size();
-            for (unsigned i = 0; i < instr->operands.size(); i++)
-            {
+            for (unsigned i = 0; i < instr->operands.size(); i++) {
+               if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
+                   instr->opcode == aco_opcode::v_readlane_b32 ||
+                   instr->opcode == aco_opcode::v_readlane_b32_e64 ||
+                   instr->opcode == aco_opcode::v_writelane_b32 ||
+                   instr->opcode == aco_opcode::v_writelane_b32_e64) {
+                  check(!instr->operands[i].isLiteral(), "No literal allowed on VALU instruction", instr.get());
+                  check(i == 1 || (instr->operands[i].isTemp() && instr->operands[i].regClass() == v1),
+                        "Wrong Operand type for VALU instruction", instr.get());
+                  continue;
+               }
                if (instr->operands[i].isTemp() && instr->operands[i].regClass().type() == RegType::sgpr) {
                   check(i != 1 || (int) instr->format & (int) Format::VOP3A, "Wrong source position for SGPR argument", instr.get());
-- 
2.30.2
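
Note: the validator change above rejects literals on all lane instructions, so the optimizer may only fold constants that the hardware can encode inline. A simplified, standalone check of that range (integer inline constants only; the special float encodings are omitted for brevity):

// Sketch: readlane/writelane never accept a 32-bit literal, so constant folding
// is limited to GCN's inline constants. Integer inline constants cover -16..64.
#include <cstdint>
#include <cstdio>

bool is_inline_integer_constant(int64_t v)
{
   // values in -16..64 are encoded directly in the source field, no literal dword
   return v >= -16 && v <= 64;
}

int main()
{
   std::printf("lane index 31 foldable: %d\n", is_inline_integer_constant(31));
   std::printf("0x12345678 needs a literal: %d\n", !is_inline_integer_constant(0x12345678));
}
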