Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
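+ /* on GFX10, ds_bpermute only reads within a half-wave of a wave64; compare
+  * bit 5 to tell whether the source lane and the index are in the same half */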
- Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2, vcc), lane_is_hi, index_is_hi);
+ Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi);
return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
return;
aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
split->operands[0] = Operand(vec_src);
- std::array<Temp,4> elems;
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
for (unsigned i = 0; i < num_components; i++) {
elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
split->definitions[i] = Definition(elems[i]);
}
unsigned component_size = dst.size() / num_components;
- std::array<Temp,4> elems;
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
vec->definitions[0] = Definition(dst);
return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
} else {
assert(size <= 4);
- std::array<Temp,4> elems;
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
for (unsigned i = 0; i < size; ++i) {
elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
ctx->block->instructions.emplace_back(std::move(sop2));
}
-void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false)
+void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
+ bool commutative, bool swap_srcs=false, bool flush_denorms = false)
{
Builder bld(ctx->program, ctx->block);
Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
}
}
- bld.vop2(op, Definition(dst), src0, src1);
+
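+ /* GFX8 and earlier don't flush denormal results of v_min_f32/v_max_f32 even
+  * when the float mode requests it; multiplying by 1.0 afterwards is exact
+  * and applies the flush */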
+ if (flush_denorms && ctx->program->chip_class < GFX9) {
+ assert(dst.size() == 1);
+ Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
+ bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
+ } else {
+ bld.vop2(op, Definition(dst), src0, src1);
+ }
}
-void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
+void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
+ bool flush_denorms = false)
{
Temp src0 = get_alu_src(ctx, instr->src[0]);
Temp src1 = get_alu_src(ctx, instr->src[1]);
src2 = as_vgpr(ctx, src2);
Builder bld(ctx->program, ctx->block);
- bld.vop3(op, Definition(dst), src0, src1, src2);
+ if (flush_denorms && ctx->program->chip_class < GFX9) {
+ assert(dst.size() == 1);
+ Temp tmp = bld.vop3(op, bld.def(v1), src0, src1, src2);
+ bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
+ } else {
+ bld.vop3(op, Definition(dst), src0, src1, src2);
+ }
}
void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
}
void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
- aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::last_opcode, aco_opcode s64_op = aco_opcode::last_opcode)
+ aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
{
aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op;
aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op;
bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
- bool use_valu = s_op == aco_opcode::last_opcode ||
+ bool use_valu = s_op == aco_opcode::num_opcodes ||
divergent_vals ||
ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
aco_opcode op = use_valu ? v_op : s_op;
- assert(op != aco_opcode::last_opcode);
+ assert(op != aco_opcode::num_opcodes);
+ assert(dst.regClass() == ctx->program->lane_mask);
if (use_valu)
emit_vopc_instruction(ctx, instr, op, dst);
case nir_op_vec2:
case nir_op_vec3:
case nir_op_vec4: {
- std::array<Temp,4> elems;
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
elems[i] = get_alu_src(ctx, instr->src[i]);
case nir_op_ushr: {
if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
- } else if (dst.regClass() == v2) {
+ } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
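+ /* GFX8 replaced the 64-bit shifts with "rev" variants that take the shift
+  * amount first; GFX6/7 only have v_lshr/v_lshl/v_ashr with (value, shift) */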
bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
+ } else if (dst.regClass() == v2) {
+ bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
+ get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
} else if (dst.regClass() == s2) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
} else if (dst.regClass() == s1) {
case nir_op_ishl: {
if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
- } else if (dst.regClass() == v2) {
+ } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
+ } else if (dst.regClass() == v2) {
+ bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
+ get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
} else if (dst.regClass() == s2) {
case nir_op_ishr: {
if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
- } else if (dst.regClass() == v2) {
+ } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
+ } else if (dst.regClass() == v2) {
+ bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
+ get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
} else if (dst.regClass() == s2) {
}
case nir_op_fmax: {
if (dst.size() == 1) {
- emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
} else if (dst.size() == 2) {
- bld.vop3(aco_opcode::v_max_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
+ Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2),
+ get_alu_src(ctx, instr->src[0]),
+ as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
+ } else {
+ bld.vop3(aco_opcode::v_max_f64, Definition(dst),
+ get_alu_src(ctx, instr->src[0]),
+ as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ }
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_fmin: {
if (dst.size() == 1) {
- emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
} else if (dst.size() == 2) {
- bld.vop3(aco_opcode::v_min_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
+ Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2),
+ get_alu_src(ctx, instr->src[0]),
+ as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
+ } else {
+ bld.vop3(aco_opcode::v_min_f64, Definition(dst),
+ get_alu_src(ctx, instr->src[0]),
+ as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ }
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_fmax3: {
if (dst.size() == 1) {
- emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst);
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_fmin3: {
if (dst.size() == 1) {
- emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst);
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_fmed3: {
if (dst.size() == 1) {
- emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst);
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.size() == 1) {
bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
+ /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
+ // TODO: confirm that this holds under any circumstances
} else if (dst.size() == 2) {
Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
Temp src = get_alu_src(ctx, instr->src[0]);
aco_ptr<Instruction> norm;
if (dst.size() == 1) {
- Temp tmp;
- Operand half_pi(0x3e22f983u);
- if (src.type() == RegType::sgpr)
- tmp = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
- else
- tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
+ Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
+ Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, as_vgpr(ctx, src));
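+ /* 0x3e22f983 is 0.5/PI: v_sin_f32/v_cos_f32 take an input scaled so that
+  * 1.0 corresponds to one full period of 2*PI radians */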
/* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
if (ctx->options->chip_class < GFX9)
mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
Temp new_exponent = bld.tmp(v1);
Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
- mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
+ if (ctx->program->chip_class >= GFX8)
+ mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
+ else
+ mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
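+ /* v_bfrev_b32 of the inline constant 0xfffffffe yields 0x7fffffff (INT_MAX)
+  * without needing a literal */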
Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
Temp new_exponent = bld.tmp(v1);
Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
- mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
+ if (ctx->program->chip_class >= GFX8)
+ mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
+ else
+ mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
case nir_op_i2i64: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (src.regClass() == s1) {
- Temp high = bld.sopc(aco_opcode::s_ashr_i32, bld.def(s1, scc), src, Operand(31u));
+ Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
} else if (src.regClass() == v1) {
Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
if (src.type() == RegType::vgpr) {
assert(src.regClass() == v1 || src.regClass() == v2);
+ assert(dst.regClass() == bld.lm);
bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
Definition(dst), Operand(0u), src).def(0).setHint(vcc);
} else {
assert(src.regClass() == s1 || src.regClass() == s2);
- Temp tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
- bld.scc(bld.def(s1)), Operand(0u), src);
+ Temp tmp;
+ if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
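+ /* GFX7 has no s_cmp_lg_u64, but s_or_b64 sets SCC to (result != 0), which
+  * is the same condition */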
+ tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp();
+ } else {
+ tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
+ bld.scc(bld.def(s1)), Operand(0u), src);
+ }
bool_to_vector_condition(ctx, tmp, dst);
}
break;
*/
f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u));
- Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), f32, smallest);
+ Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), f32, smallest);
static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
cmp_res = vop3->definitions[0].getTemp();
}
if (instr->src[0].src.ssa->bit_size == 1)
emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
else
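+ /* s_cmp_{eq,lg}_u64 only exist on GFX8+; passing num_opcodes makes
+  * emit_comparison fall back to the VALU opcode */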
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, aco_opcode::s_cmp_eq_u64);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
+ ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
break;
}
case nir_op_ine: {
if (instr->src[0].src.ssa->bit_size == 1)
emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
else
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, aco_opcode::s_cmp_lg_u64);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
+ ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
break;
}
case nir_op_ult: {
unsigned bytes_read = 0;
unsigned result_size = 0;
unsigned total_bytes = num_components * elem_size_bytes;
- std::array<Temp, 4> result;
+ std::array<Temp, NIR_MAX_VEC_COMPONENTS> result;
while (bytes_read < total_bytes) {
unsigned todo = total_bytes - bytes_read;
aco_opcode op;
if (dst.type() == RegType::vgpr || (ctx->options->chip_class < GFX8 && !readonly)) {
- if (ctx->options->chip_class < GFX8)
- offset = as_vgpr(ctx, offset);
-
Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
unsigned const_offset = 0;
Temp vec = bld.tmp(RegType::vgpr, dst.size());
instr->definitions[0] = Definition(vec);
bld.insert(std::move(instr));
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
+ expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1);
} else {
instr->definitions[0] = Definition(dst);
bld.insert(std::move(instr));
+ emit_split_vector(ctx, dst, num_components);
}
} else {
switch (num_bytes) {
} else {
bld.insert(std::move(load));
}
-
+ emit_split_vector(ctx, dst, num_components);
}
- emit_split_vector(ctx, dst, num_components);
}
void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
if (ctx->options->chip_class >= GFX10) {
desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(3) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
S_008F0C_RESOURCE_LEVEL(1);
} else {
desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
if (ctx->options->chip_class >= GFX10) {
desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(3) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
S_008F0C_RESOURCE_LEVEL(1);
} else {
desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
}
+ if (instr->intrinsic == nir_intrinsic_image_deref_load ||
+ instr->intrinsic == nir_intrinsic_image_deref_store) {
+ int lod_index = instr->intrinsic == nir_intrinsic_image_deref_load ? 3 : 4;
+ bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
+
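+ /* the _mip opcode variants take the mip level as an extra address
+  * component, so append it unless it is known to be zero */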
+ if (!level_zero)
+ coords.emplace_back(Operand(get_ssa_temp(ctx, instr->src[lod_index].ssa)));
+ }
+
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
for (unsigned i = 0; i < coords.size(); i++)
vec->operands[i] = coords[i];
else
tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
- aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
+ bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
+ aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
+
+ aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 2, 1)};
load->operands[0] = Operand(coords);
load->operands[1] = Operand(resource);
load->definitions[0] = Definition(tmp);
Temp coords = get_image_coords(ctx, instr, type);
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
- aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
+ bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
+ aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
+
+ aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 4, 0)};
store->operands[0] = Operand(coords);
store->operands[1] = Operand(resource);
store->operands[2] = Operand(s4);
void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
{
if (in_elements && ctx->options->chip_class == GFX8) {
+ /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
Builder bld(ctx->program, ctx->block);
+ Temp size = emit_extract_vector(ctx, desc, 2, s1);
+
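+ /* unsigned division by 3 via multiply-high: 0xaaaaaaab == ceil(2^33 / 3),
+  * so mulhi(size, 0xaaaaaaab) >> 1 == size / 3 exactly for any u32 size.
+  * stride 12 then reduces to (size / 3) >> s_ff1(12) == size / 12 below */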
+ Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size);
+ size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), bld.as_uniform(size_div3), Operand(1u));
+
Temp stride = emit_extract_vector(ctx, desc, 1, s1);
stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
- stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
- stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
- Temp size = emit_extract_vector(ctx, desc, 2, s1);
- size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
-
- Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
- res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
-
- // TODO: we can probably calculate this faster on the scalar unit to do: size / stride{1,2,4,8,12,16}
- /* idea
- * for 1,2,4,8,16, the result is just (stride >> S_FF1_I32_B32)
- * in case 12 (or 3?), we have to divide by 3:
- * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
- * use v_mul_hi_u32 with magic number to divide
- * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
- * disable v_skip
- * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
- */
+ Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u));
+ size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
+ Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
+ bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc),
+ size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
+ if (dst.type() == RegType::vgpr)
+ bld.copy(Definition(dst), shr_dst);
+
+ /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
} else {
emit_extract_vector(ctx, desc, 2, dst);
}
aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
mimg->operands[0] = Operand(lod);
mimg->operands[1] = Operand(resource);
- unsigned& dmask = mimg->dmask;
+ uint8_t& dmask = mimg->dmask;
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
mimg->da = glsl_sampler_type_is_array(type);
Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
unsigned writemask = nir_intrinsic_write_mask(instr);
-
- Temp offset;
- if (ctx->options->chip_class < GFX8)
- offset = as_vgpr(ctx,get_ssa_temp(ctx, instr->src[2].ssa));
- else
- offset = get_ssa_temp(ctx, instr->src[2].ssa);
+ Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
get_ssa_temp(ctx, instr->src[3].ssa), data);
- Temp offset;
- if (ctx->options->chip_class < GFX8)
- offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
- else
- offset = get_ssa_temp(ctx, instr->src[1].ssa);
-
+ Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
if (ctx->program->chip_class >= GFX10) {
rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(3) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
S_008F0C_RESOURCE_LEVEL(1);
} else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
- if (ctx->program->wave_size == 64)
+
+ if (ctx->program->chip_class <= GFX7)
+ tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
+ else if (ctx->program->wave_size == 64)
tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
else
tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
case nir_intrinsic_get_buffer_size:
visit_get_buffer_size(ctx, instr);
break;
- case nir_intrinsic_barrier: {
+ case nir_intrinsic_control_barrier: {
unsigned* bsize = ctx->program->info->cs.block_size;
unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
if (workgroup_size > ctx->program->wave_size)
case nir_intrinsic_memory_barrier_shared:
emit_memory_barrier(ctx, instr);
break;
+ case nir_intrinsic_memory_barrier_tcs_patch:
+ break;
case nir_intrinsic_load_num_work_groups: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
} else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
assert(src.regClass() == bld.lm);
Temp tmp;
- if (ctx->program->wave_size == 64)
+ if (ctx->program->chip_class <= GFX7)
+ tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
+ else if (ctx->program->wave_size == 64)
tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
else
tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
}
case nir_intrinsic_demote:
bld.pseudo(aco_opcode::p_demote_to_helper);
+
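+ /* a demote inside divergent control flow may leave exec empty, so
+  * subsequent code must not assume any lanes are still active */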
+ if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
+ ctx->cf_info.exec_potentially_empty = true;
ctx->block->kind |= block_kind_uses_demote;
ctx->program->needs_exact = true;
break;
assert(src.regClass() == bld.lm);
Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
bld.pseudo(aco_opcode::p_demote_to_helper, cond);
+
+ if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
+ ctx->cf_info.exec_potentially_empty = true;
ctx->block->kind |= block_kind_uses_demote;
ctx->program->needs_exact = true;
break;
Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
- Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
+ Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_y, is_ma_z);
Temp is_not_ma_x = bld.sop2(Builder::s_or, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y);
if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) {
// TODO: scalarize linear phis on divergent ifs
bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
- std::array<Temp, 4> new_vec;
+ std::array<Temp, NIR_MAX_VEC_COMPONENTS> new_vec;
for (unsigned i = 0; can_scalarize && (i < num_operands); i++) {
Operand src = operands[i];
if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end())
else
exp->operands[i] = Operand(v1);
}
- exp->valid_mask = false;
+ /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
+ * Setting valid_mask=1 prevents it and has no other effect.
+ */
+ exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0;
exp->done = false;
exp->compressed = false;
if (is_pos)
exp->enabled_mask |= 0x4;
}
}
- exp->valid_mask = false;
+ exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0;
exp->done = false;
exp->compressed = false;
exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
Temp tid = emit_mbcnt(ctx, bld.def(v1));
- Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid);
+ Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
if_context ic;
begin_divergent_if_then(ctx, &ic, can_emit);
/* Split all arguments except for the first (ring_offsets) and the last
* (exec) so that the dead channels don't stay live throughout the program.
*/
- for (unsigned i = 1; i < startpgm->definitions.size() - 1; i++) {
+ for (unsigned i = 1; i + 1 < startpgm->definitions.size(); i++) {
if (startpgm->definitions[i].regClass().size() > 1) {
emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
startpgm->definitions[i].regClass().size());