aco: don't split store data if it was already split into more elements

[mesa.git] / src / amd / compiler / aco_instruction_selection.cpp
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp

index 8e9d6bff89243037eb900ce7a621452012af6094..80ea1e133d661420025877f3f024807687d4555e 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -472,7 +472,7 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un
        offset = Operand(0u);
     }
  
-   unsigned num_components = dst.bytes() / component_size;
+   unsigned num_components = vec.bytes() / component_size;
     if (vec.regClass() == dst.regClass()) {
        assert(offset.constantValue() == 0);
        bld.copy(Definition(dst), vec);
@@ -480,17 +480,18 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un
        return;
     }
  
-   emit_split_vector(ctx, vec, vec.bytes() / component_size);
+   emit_split_vector(ctx, vec, num_components);
     std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
     RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
  
     assert(offset.constantValue() % component_size == 0);
     unsigned skip = offset.constantValue() / component_size;
-   for (unsigned i = 0; i < num_components; i++)
-      elems[i] = emit_extract_vector(ctx, vec, i + skip, rc);
+   for (unsigned i = skip; i < num_components; i++)
+      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
  
     /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
     if (dst.type() == RegType::vgpr) {
+      num_components = dst.bytes() / component_size;
        aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
        for (unsigned i = 0; i < num_components; i++)
           create_vec->operands[i] = Operand(elems[i]);
@@ -3231,7 +3232,9 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
  
        /* align offset down if needed */
        Operand aligned_offset = offset;
+      unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
        if (need_to_align_offset) {
+         align = 4;
           Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
           if (offset.isConstant()) {
              aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
@@ -3251,7 +3254,6 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
        Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
                                  bld.copy(bld.def(s1), aligned_offset);
  
-      unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
        Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
                            reduced_const_offset, byte_align ? Temp() : info->dst);
  
@@ -3313,7 +3315,7 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
        if (num_tmps > 1) {
           aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
              aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
-         for (unsigned i = 0; i < num_vals; i++)
+         for (unsigned i = 0; i < num_tmps; i++)
              vec->operands[i] = Operand(tmp[i]);
           tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
           vec->definitions[0] = Definition(tmp[0]);
@@ -3512,10 +3514,10 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
  
     unsigned bytes_size = 0;
     aco_opcode op;
-   if (bytes_needed == 1) {
+   if (bytes_needed == 1 || align_ % 2) {
        bytes_size = 1;
        op = aco_opcode::buffer_load_ubyte;
-   } else if (bytes_needed == 2) {
+   } else if (bytes_needed == 2 || align_ % 4) {
        bytes_size = 2;
        op = aco_opcode::buffer_load_ushort;
     } else if (bytes_needed <= 4) {
@@ -3542,7 +3544,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
     mubuf->can_reorder = info->can_reorder;
     mubuf->offset = const_offset;
     mubuf->swizzled = info->swizzle_component_size != 0;
-   RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+   RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
     Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
     mubuf->definitions[0] = Definition(val);
     bld.insert(std::move(mubuf));
@@ -3551,6 +3553,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
  }
  
  static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
+static auto emit_scratch_load = emit_load<mubuf_load_callback, false, true, 4096>;
  
  Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
  {
@@ -3675,13 +3678,15 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp
        /* use allocated_vec if possible */
        auto it = ctx->allocated_vec.find(src.id());
        if (it != ctx->allocated_vec.end()) {
-         unsigned total_size = 0;
-         for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++)
-            total_size += it->second[i].bytes();
-         if (total_size != src.bytes())
+         if (!it->second[0].id())
              goto split;
-
           unsigned elem_size = it->second[0].bytes();
+         assert(src.bytes() % elem_size == 0);
+
+         for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
+            if (!it->second[i].id())
+               goto split;
+         }
  
           for (unsigned i = 0; i < count; i++) {
              if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
@@ -3713,10 +3718,11 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp
        }
     }
  
+   split:
+
     if (dst_type == RegType::sgpr)
        src = bld.as_uniform(src);
  
-   split:
     /* just split it */
     aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
     split->operands[0] = Operand(src);
@@ -6793,6 +6799,12 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
           op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
           num_operands = 4;
           break;
+      case nir_intrinsic_shared_atomic_fadd:
+         op32 = aco_opcode::ds_add_f32;
+         op32_rtn = aco_opcode::ds_add_rtn_f32;
+         op64 = aco_opcode::num_opcodes;
+         op64_rtn = aco_opcode::num_opcodes;
+         break;
        default:
           unreachable("Unhandled shared atomic intrinsic");
     }
@@ -6843,7 +6855,7 @@ Temp get_scratch_resource(isel_context *ctx)
        scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
  
     uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
-                        S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);;
+                        S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
  
     if (ctx->program->chip_class >= GFX10) {
        rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
@@ -6854,9 +6866,9 @@ Temp get_scratch_resource(isel_context *ctx)
                     S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
     }
  
-   /* older generations need element size = 16 bytes. element size removed in GFX9 */
+   /* older generations need element size = 4 bytes. element size removed in GFX9 */
     if (ctx->program->chip_class <= GFX8)
-      rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
+      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
  
     return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
  }
@@ -6871,10 +6883,10 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
                          instr->dest.ssa.bit_size / 8u, rsrc};
     info.align_mul = nir_intrinsic_align_mul(instr);
     info.align_offset = nir_intrinsic_align_offset(instr);
-   info.swizzle_component_size = 16;
+   info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
     info.can_reorder = false;
     info.soffset = ctx->program->scratch_offset;
-   emit_mubuf_load(ctx, bld, &info);
+   emit_scratch_load(ctx, bld, &info);
  }
  
  void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
@@ -6889,8 +6901,9 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
     unsigned write_count = 0;
     Temp write_datas[32];
     unsigned offsets[32];
+   unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
     split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
-                      16, &write_count, write_datas, offsets);
+                      swizzle_component_size, &write_count, write_datas, offsets);
  
     for (unsigned i = 0; i < write_count; i++) {
        aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
@@ -7448,6 +7461,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     case nir_intrinsic_shared_atomic_xor:
     case nir_intrinsic_shared_atomic_exchange:
     case nir_intrinsic_shared_atomic_comp_swap:
+   case nir_intrinsic_shared_atomic_fadd:
        visit_shared_atomic(ctx, instr);
        break;
     case nir_intrinsic_image_deref_load: