aco: don't split store data if it was already split into more elements

[mesa.git] / src / amd / compiler / aco_instruction_selection.cpp
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp

index c0cc445ffa38bd5fbd1863f644ee3c961304384c..80ea1e133d661420025877f3f024807687d4555e 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -136,8 +136,11 @@ Temp emit_mbcnt(isel_context *ctx, Definition dst,
  
     if (ctx->program->wave_size == 32) {
        return thread_id_lo;
+   } else if (ctx->program->chip_class <= GFX7) {
+      Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
+      return thread_id_hi;
     } else {
-      Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
+      Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo);
        return thread_id_hi;
     }
  }
@@ -204,6 +207,36 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data
     }
  }
  
+static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask) /* Emit a lane swizzle of src described by mask and return the new v1 temp. */
+{
+   if (ctx->options->chip_class >= GFX8) { /* the DPP shortcut below is only attempted on GFX8 and newer */
+      unsigned and_mask = mask & 0x1f; /* mask bits 0-4 */
+      unsigned or_mask = (mask >> 5) & 0x1f; /* mask bits 5-9 */
+      unsigned xor_mask = (mask >> 10) & 0x1f; /* mask bits 10-14 */
+
+      uint16_t dpp_ctrl = 0xffff; /* sentinel: stays 0xffff if no pattern below matches */
+
+      // TODO: we could use DPP8 for some swizzles
+      if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) { /* or/xor fit in two bits: build a quad permutation */
+         unsigned res[4] = {0, 1, 2, 3}; /* the four indices of a quad before the swizzle */
+         for (unsigned i = 0; i < 4; i++)
+            res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3; /* apply OR, then XOR, keep the low two bits */
+         dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
+      } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) { /* pure XOR with 8 */
+         dpp_ctrl = dpp_row_rr(8); /* NOTE(review): presumably a rotate by 8 within a 16-lane row - confirm */
+      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) { /* pure XOR with 0xf */
+         dpp_ctrl = dpp_row_mirror;
+      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) { /* pure XOR with 0x7 */
+         dpp_ctrl = dpp_row_half_mirror;
+      }
+
+      if (dpp_ctrl != 0xffff) /* a DPP control was chosen: emit a v_mov_b32 with DPP and skip ds_swizzle */
+         return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
+   }
+
+   return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false); /* fallback: pass the unmodified mask to ds_swizzle_b32 */
+}
+
  Temp as_vgpr(isel_context *ctx, Temp val)
  {
     if (val.type() == RegType::sgpr) {
@@ -402,7 +435,7 @@ void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
        bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
        hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
        if (select != Temp())
-         hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), select);
+         hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
        lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
        Temp mid = bld.tmp(s1);
        lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
@@ -439,7 +472,7 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un
        offset = Operand(0u);
     }
  
-   unsigned num_components = dst.bytes() / component_size;
+   unsigned num_components = vec.bytes() / component_size;
     if (vec.regClass() == dst.regClass()) {
        assert(offset.constantValue() == 0);
        bld.copy(Definition(dst), vec);
@@ -447,17 +480,18 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un
        return;
     }
  
-   emit_split_vector(ctx, vec, vec.bytes() / component_size);
+   emit_split_vector(ctx, vec, num_components);
     std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
     RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
  
     assert(offset.constantValue() % component_size == 0);
     unsigned skip = offset.constantValue() / component_size;
-   for (unsigned i = 0; i < num_components; i++)
-      elems[i] = emit_extract_vector(ctx, vec, i + skip, rc);
+   for (unsigned i = skip; i < num_components; i++)
+      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
  
     /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
     if (dst.type() == RegType::vgpr) {
+      num_components = dst.bytes() / component_size;
        aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
        for (unsigned i = 0; i < num_components; i++)
           create_vec->operands[i] = Operand(elems[i]);
@@ -581,6 +615,8 @@ void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
     sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
     sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
     sop2->definitions[0] = Definition(dst);
+   if (instr->no_unsigned_wrap)
+      sop2->definitions[0].setNUW(true);
     if (writes_scc)
        sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
     ctx->block->instructions.emplace_back(std::move(sop2));
@@ -613,6 +649,31 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
     }
  }
  
+void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr, /* Emit a two-dword logic op as two VOP2 instructions, one per dword. */
+                                   aco_opcode op, Temp dst) /* op is applied to each half; dst receives the recombined result */
+{
+   Builder bld(ctx->program, ctx->block);
+   bld.is_precise = instr->exact; /* carry the NIR exact flag over to the builder */
+
+   Temp src0 = get_alu_src(ctx, instr->src[0]);
+   Temp src1 = get_alu_src(ctx, instr->src[1]);
+
+   if (src1.type() == RegType::sgpr) { /* ensure src1 is the VGPR operand; at most one source may be an SGPR */
+      assert(src0.type() == RegType::vgpr);
+      std::swap(src0, src1); /* NOTE(review): assumes op is commutative, as for the and/or/xor callers - confirm */
+   }
+
+   Temp src00 = bld.tmp(src0.type(), 1); /* first half of src0, same register type as src0 */
+   Temp src01 = bld.tmp(src0.type(), 1); /* second half of src0 */
+   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
+   Temp src10 = bld.tmp(v1); /* first half of src1 (v1 temp) */
+   Temp src11 = bld.tmp(v1); /* second half of src1 (v1 temp) */
+   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
+   Temp lo = bld.vop2(op, bld.def(v1), src00, src10); /* op on the first halves */
+   Temp hi = bld.vop2(op, bld.def(v1), src01, src11); /* op on the second halves */
+   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); /* recombine both halves into dst */
+}
+
  void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
                              bool flush_denorms = false)
  {
@@ -954,7 +1015,8 @@ Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
     if (ctx->options->chip_class >= GFX7)
        return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
  
-   /* GFX6 doesn't support V_FLOOR_F64, lower it. */
+   /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
+    * lowered at NIR level for precision reasons). */
     Temp src0 = as_vgpr(ctx, val);
  
     Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
@@ -1125,6 +1187,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
+      } else if (dst.regClass() == v2) {
+         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
+         lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
+         hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
        } else if (dst.type() == RegType::sgpr) {
           aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
           bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
@@ -1260,6 +1328,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           emit_boolean_logic(ctx, instr, Builder::s_or, dst);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
+      } else if (dst.regClass() == v2) {
+         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
        } else if (dst.regClass() == s1) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
        } else if (dst.regClass() == s2) {
@@ -1276,6 +1346,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           emit_boolean_logic(ctx, instr, Builder::s_and, dst);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
+      } else if (dst.regClass() == v2) {
+         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
        } else if (dst.regClass() == s1) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
        } else if (dst.regClass() == s2) {
@@ -1292,6 +1364,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
+      } else if (dst.regClass() == v2) {
+         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
        } else if (dst.regClass() == s1) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
        } else if (dst.regClass() == s2) {
@@ -1870,6 +1944,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == v1) {
           emit_rsq(ctx, bld, Definition(dst), src);
        } else if (dst.regClass() == v2) {
+         /* Lowered at NIR level for precision reasons. */
           emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
        } else {
           fprintf(stderr, "Unimplemented NIR instr bit size: ");
@@ -1881,6 +1956,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_fneg: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
+         if (ctx->block->fp_mode.must_flush_denorms16_64)
+            src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
           bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
        } else if (dst.regClass() == v1) {
           if (ctx->block->fp_mode.must_flush_denorms32)
@@ -1903,6 +1980,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_fabs: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
+         if (ctx->block->fp_mode.must_flush_denorms16_64)
+            src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
           bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
        } else if (dst.regClass() == v1) {
           if (ctx->block->fp_mode.must_flush_denorms32)
@@ -1961,6 +2040,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == v1) {
           emit_rcp(ctx, bld, Definition(dst), src);
        } else if (dst.regClass() == v2) {
+         /* Lowered at NIR level for precision reasons. */
           emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
        } else {
           fprintf(stderr, "Unimplemented NIR instr bit size: ");
@@ -1988,6 +2068,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == v1) {
           emit_sqrt(ctx, bld, Definition(dst), src);
        } else if (dst.regClass() == v2) {
+         /* Lowered at NIR level for precision reasons. */
           emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
        } else {
           fprintf(stderr, "Unimplemented NIR instr bit size: ");
@@ -2220,7 +2301,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 64)
           src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
-      bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
+      if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
+         /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
+          * keep value numbering and the scheduler simpler.
+          */
+         bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
+      else
+         bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
        break;
     }
     case nir_op_f2f16_rtz: {
@@ -2609,18 +2696,25 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        break;
     }
     case nir_op_b2b32:
-   case nir_op_b2i32: {
+   case nir_op_b2i8:
+   case nir_op_b2i16:
+   case nir_op_b2i32:
+   case nir_op_b2i64: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        assert(src.regClass() == bld.lm);
  
-      if (dst.regClass() == s1) {
+      Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
+      if (tmp.regClass() == s1) {
           // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
-         bool_to_scalar_condition(ctx, src, dst);
-      } else if (dst.regClass() == v1) {
-         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
+         bool_to_scalar_condition(ctx, src, tmp);
+      } else if (tmp.type() == RegType::vgpr) {
+         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src);
        } else {
           unreachable("Invalid register class for b2i32");
        }
+
+      if (tmp != dst)
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
        break;
     }
     case nir_op_b2b1:
@@ -3074,7 +3168,9 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
        int byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
  
        if (byte_align) {
-         if ((bytes_needed > 2 || !supports_8bit_16bit_loads) && byte_align_loads) {
+         if ((bytes_needed > 2 ||
+              (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
+              !supports_8bit_16bit_loads) && byte_align_loads) {
              if (info->component_stride) {
                 assert(supports_8bit_16bit_loads && "unimplemented");
                 bytes_needed = 2;
@@ -3136,7 +3232,9 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
  
        /* align offset down if needed */
        Operand aligned_offset = offset;
+      unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
        if (need_to_align_offset) {
+         align = 4;
           Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
           if (offset.isConstant()) {
              aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
@@ -3156,7 +3254,6 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
        Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
                                  bld.copy(bld.def(s1), aligned_offset);
  
-      unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
        Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
                            reduced_const_offset, byte_align ? Temp() : info->dst);
  
@@ -3218,7 +3315,7 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
        if (num_tmps > 1) {
           aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
              aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
-         for (unsigned i = 0; i < num_vals; i++)
+         for (unsigned i = 0; i < num_tmps; i++)
              vec->operands[i] = Operand(tmp[i]);
           tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
           vec->definitions[0] = Definition(tmp[0]);
@@ -3417,10 +3514,10 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
  
     unsigned bytes_size = 0;
     aco_opcode op;
-   if (bytes_needed == 1) {
+   if (bytes_needed == 1 || align_ % 2) {
        bytes_size = 1;
        op = aco_opcode::buffer_load_ubyte;
-   } else if (bytes_needed == 2) {
+   } else if (bytes_needed == 2 || align_ % 4) {
        bytes_size = 2;
        op = aco_opcode::buffer_load_ushort;
     } else if (bytes_needed <= 4) {
@@ -3446,7 +3543,8 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
     mubuf->barrier = info->barrier;
     mubuf->can_reorder = info->can_reorder;
     mubuf->offset = const_offset;
-   RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+   mubuf->swizzled = info->swizzle_component_size != 0;
+   RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
     Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
     mubuf->definitions[0] = Definition(val);
     bld.insert(std::move(mubuf));
@@ -3455,6 +3553,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
  }
  
  static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
+static auto emit_scratch_load = emit_load<mubuf_load_callback, false, true, 4096>;
  
  Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
  {
@@ -3579,13 +3678,15 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp
        /* use allocated_vec if possible */
        auto it = ctx->allocated_vec.find(src.id());
        if (it != ctx->allocated_vec.end()) {
-         unsigned total_size = 0;
-         for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++)
-            total_size += it->second[i].bytes();
-         if (total_size != src.bytes())
+         if (!it->second[0].id())
              goto split;
-
           unsigned elem_size = it->second[0].bytes();
+         assert(src.bytes() % elem_size == 0);
+
+         for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
+            if (!it->second[i].id())
+               goto split;
+         }
  
           for (unsigned i = 0; i < count; i++) {
              if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
@@ -3617,10 +3718,11 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp
        }
     }
  
+   split:
+
     if (dst_type == RegType::sgpr)
        src = bld.as_uniform(src);
  
-   split:
     /* just split it */
     aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
     split->operands[0] = Operand(src);
@@ -3820,10 +3922,10 @@ void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem
  
        /* dword or larger stores have to be dword-aligned */
        unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
-      unsigned align_offset = instr ? nir_intrinsic_align_mul(instr) : 0;
-      bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0;
-      if (bytes >= 4 && !dword_aligned)
-         bytes = MIN2(bytes, 2);
+      unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
+      bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
+      if (!dword_aligned)
+         bytes = MIN2(bytes, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
  
        advance_write_mask(&todo, offset, bytes);
        write_count_with_skips++;
@@ -3897,7 +3999,8 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un
  }
  
  void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
-                             unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
+                             unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false,
+                             bool swizzled = false)
  {
     assert(vdata.id());
     assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
@@ -3910,8 +4013,9 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T
     Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
     Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
     Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
-                                 /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
-                                 /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc);
+                                 /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
+                                 /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
+                                 /* dlc*/ false, /* slc */ slc);
  
     static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
  }
@@ -3933,7 +4037,7 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset
  
     for (unsigned i = 0; i < write_count; i++) {
        unsigned const_offset = offsets[i] + base_const_offset;
-      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc);
+      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc, !allow_combining);
     }
  }
  
@@ -4745,7 +4849,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
           if (use_mubuf) {
              Instruction *mubuf = bld.mubuf(opcode,
                                             Definition(fetch_dst), list, fetch_index, soffset,
-                                           fetch_offset, false, true).instr;
+                                           fetch_offset, false, false, true).instr;
              static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
           } else {
              Instruction *mtbuf = bld.mtbuf(opcode,
@@ -5104,11 +5208,11 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
  
  void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
                   Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
-                 bool glc=false, bool readonly=true)
+                 bool glc=false, bool readonly=true, bool allow_smem=true)
  {
     Builder bld(ctx->program, ctx->block);
  
-   bool use_smem = dst.type() != RegType::vgpr && ((ctx->options->chip_class >= GFX8 && component_size >= 4) || readonly);
+   bool use_smem = dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
     if (use_smem)
        offset = bld.as_uniform(offset);
  
@@ -5191,7 +5295,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
  
     Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
     if (offset != 0) // TODO check if index != 0 as well
-      index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
+      index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
     Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
     Temp vec = dst;
     bool trim = false;
@@ -5233,7 +5337,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
        unreachable("unimplemented or forbidden load_push_constant.");
     }
  
-   bld.smem(op, Definition(vec), ptr, index);
+   static_cast<SMEM_instruction*>(bld.smem(op, Definition(vec), ptr, index).instr)->prevent_overflow = true;
  
     if (!aligned) {
        Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
@@ -5277,7 +5381,7 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
  
     Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
     if (base && offset.type() == RegType::sgpr)
-      offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
+      offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
     else if (base && offset.type() == RegType::vgpr)
        offset = bld.vadd32(bld.def(v1), Operand(base), offset);
  
@@ -6103,10 +6207,19 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
     Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
     rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
  
-   bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
+   unsigned access = nir_intrinsic_access(instr);
+   bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
     unsigned size = instr->dest.ssa.bit_size / 8;
+
+   uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[0].ssa, access);
+   /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
+    * TODO: this optimization is disabled for now because we still need to ensure correct ordering
+    */
+   bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_store : has_vmem_store));
+   allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
+
     load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
-               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false);
+               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
  }
  
  void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
@@ -6120,9 +6233,17 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
     Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
     rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
  
+   bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+   uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
+   /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
+    * TODO: this optimization is disabled for now because we still need to ensure correct ordering
+    */
+   bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
+
     bool smem = !nir_src_is_divergent(instr->src[2]) &&
                 ctx->options->chip_class >= GFX8 &&
-               elem_size_bytes >= 4;
+               (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
+               allow_smem;
     if (smem)
        offset = bld.as_uniform(offset);
     bool smem_nonfs = smem && ctx->stage != fragment_fs;
@@ -6142,8 +6263,8 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
           aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
           store->operands[0] = Operand(rsrc);
           if (offsets[i]) {
-            Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
-                                offset, Operand(offsets[i]));
+            Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
+                                      offset, Operand(offsets[i]));
              store->operands[1] = Operand(off);
           } else {
              store->operands[1] = Operand(offset);
@@ -6151,7 +6272,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
           if (op != aco_opcode::p_fs_buffer_store_smem)
              store->operands[1].setFixed(m0);
           store->operands[2] = Operand(write_datas[i]);
-         store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+         store->glc = glc;
           store->dlc = false;
           store->disable_wqm = true;
           store->barrier = barrier_buffer;
@@ -6169,7 +6290,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
           store->operands[3] = Operand(write_datas[i]);
           store->offset = offsets[i];
           store->offen = (offset.type() == RegType::vgpr);
-         store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+         store->glc = glc;
           store->dlc = false;
           store->disable_wqm = true;
           store->barrier = barrier_buffer;
@@ -6678,6 +6799,12 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
           op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
           num_operands = 4;
           break;
+      case nir_intrinsic_shared_atomic_fadd:
+         op32 = aco_opcode::ds_add_f32;
+         op32_rtn = aco_opcode::ds_add_rtn_f32;
+         op64 = aco_opcode::num_opcodes;
+         op64_rtn = aco_opcode::num_opcodes;
+         break;
        default:
           unreachable("Unhandled shared atomic intrinsic");
     }
@@ -6728,7 +6855,7 @@ Temp get_scratch_resource(isel_context *ctx)
        scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
  
     uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
-                        S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);;
+                        S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
  
     if (ctx->program->chip_class >= GFX10) {
        rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
@@ -6739,9 +6866,9 @@ Temp get_scratch_resource(isel_context *ctx)
                     S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
     }
  
-   /* older generations need element size = 16 bytes. element size removed in GFX9 */
+   /* older generations need element size = 4 bytes. element size removed in GFX9 */
     if (ctx->program->chip_class <= GFX8)
-      rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
+      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
  
     return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
  }
@@ -6756,10 +6883,10 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
                          instr->dest.ssa.bit_size / 8u, rsrc};
     info.align_mul = nir_intrinsic_align_mul(instr);
     info.align_offset = nir_intrinsic_align_offset(instr);
-   info.swizzle_component_size = 16;
+   info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
     info.can_reorder = false;
     info.soffset = ctx->program->scratch_offset;
-   emit_mubuf_load(ctx, bld, &info);
+   emit_scratch_load(ctx, bld, &info);
  }
  
  void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
@@ -6774,12 +6901,13 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
     unsigned write_count = 0;
     Temp write_datas[32];
     unsigned offsets[32];
+   unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
     split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
-                      16, &write_count, write_datas, offsets);
+                      swizzle_component_size, &write_count, write_datas, offsets);
  
     for (unsigned i = 0; i < write_count; i++) {
        aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
-      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
+      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
     }
  }
  
@@ -7160,6 +7288,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
        nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
        Temp private_segment_buffer = ctx->program->private_segment_buffer;
+      // TODO: bounds checking?
        if (addr.type() == RegType::sgpr) {
           Operand offset;
           if (const_addr) {
@@ -7332,6 +7461,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     case nir_intrinsic_shared_atomic_xor:
     case nir_intrinsic_shared_atomic_exchange:
     case nir_intrinsic_shared_atomic_comp_swap:
+   case nir_intrinsic_shared_atomic_fadd:
        visit_shared_atomic(ctx, instr);
        break;
     case nir_intrinsic_image_deref_load:
@@ -7858,15 +7988,25 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        }
        Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
        uint32_t mask = nir_intrinsic_swizzle_mask(instr);
-      if (dst.regClass() == v1) {
-         emit_wqm(ctx,
-                  bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
-                  dst);
+      if (instr->dest.ssa.bit_size == 1) {
+         assert(src.regClass() == bld.lm);
+         src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
+         src = emit_masked_swizzle(ctx, bld, src, mask);
+         Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
+         emit_wqm(ctx, tmp, dst);
+      } else if (dst.regClass() == v1b) {
+         Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask));
+         emit_extract_vector(ctx, tmp, 0, dst);
+      } else if (dst.regClass() == v2b) {
+         Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask));
+         emit_extract_vector(ctx, tmp, 0, dst);
+      } else if (dst.regClass() == v1) {
+         emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask), dst);
        } else if (dst.regClass() == v2) {
           Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
           bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
-         lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
-         hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
+         lo = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, lo, mask));
+         hi = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, hi, mask));
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
           emit_split_vector(ctx, dst, 2);
        } else {
@@ -8890,13 +9030,19 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
  }
  
  
-Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
+Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool logical)
  {
     Temp tmp = get_ssa_temp(ctx, ssa);
-   if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
-      return Operand(tmp.regClass());
-   else
+   if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
+      return Operand(rc);
+   } else if (logical && ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) {
+      if (ctx->program->wave_size == 64)
+         return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX : 0u);
+      else
+         return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX : 0u);
+   } else {
        return Operand(tmp);
+   }
  }
  
  void visit_phi(isel_context *ctx, nir_phi_instr *instr)
@@ -8939,7 +9085,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr)
        if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
           continue;
        cur_pred_idx++;
-      Operand op = get_phi_operand(ctx, src.second);
+      Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
        operands[num_operands++] = op;
        num_defined += !op.isUndefined();
     }
@@ -10215,8 +10361,8 @@ static void write_tcs_tess_factors(isel_context *ctx)
        Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
        bld.mubuf(aco_opcode::buffer_store_dword,
                  /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
-                /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
-                /* disable_wqm */ false, /* glc */ true);
+                /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen */ false,
+                /* addr64 */ false, /* disable_wqm */ false, /* glc */ true);
        tf_const_offset += 4;
  
        begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
@@ -10897,16 +11043,6 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
  {
     isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
  
-   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
-   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
-   program->next_fp_mode.must_flush_denorms32 = false;
-   program->next_fp_mode.must_flush_denorms16_64 = false;
-   program->next_fp_mode.care_about_round32 = false;
-   program->next_fp_mode.care_about_round16_64 = false;
-   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
-   program->next_fp_mode.denorm32 = 0;
-   program->next_fp_mode.round32 = fp_round_ne;
-   program->next_fp_mode.round16_64 = fp_round_ne;
     ctx.block->fp_mode = program->next_fp_mode;
  
     add_startpgm(&ctx);