aco: don't split store data if it was already split into more elements

[mesa.git] / src / amd / compiler / aco_instruction_selection.cpp
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp

index 0af1f1f5c155e805b078aa3de26955c3271951a7..80ea1e133d661420025877f3f024807687d4555e 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -472,7 +472,7 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un
        offset = Operand(0u);
     }
  
-   unsigned num_components = dst.bytes() / component_size;
+   unsigned num_components = vec.bytes() / component_size;
     if (vec.regClass() == dst.regClass()) {
        assert(offset.constantValue() == 0);
        bld.copy(Definition(dst), vec);
@@ -480,17 +480,18 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un
        return;
     }
  
-   emit_split_vector(ctx, vec, vec.bytes() / component_size);
+   emit_split_vector(ctx, vec, num_components);
     std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
     RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
  
     assert(offset.constantValue() % component_size == 0);
     unsigned skip = offset.constantValue() / component_size;
-   for (unsigned i = 0; i < num_components; i++)
-      elems[i] = emit_extract_vector(ctx, vec, i + skip, rc);
+   for (unsigned i = skip; i < num_components; i++)
+      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
  
     /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
     if (dst.type() == RegType::vgpr) {
+      num_components = dst.bytes() / component_size;
        aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
        for (unsigned i = 0; i < num_components; i++)
           create_vec->operands[i] = Operand(elems[i]);
@@ -614,6 +615,8 @@ void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
     sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
     sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
     sop2->definitions[0] = Definition(dst);
+   if (instr->no_unsigned_wrap)
+      sop2->definitions[0].setNUW(true);
     if (writes_scc)
        sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
     ctx->block->instructions.emplace_back(std::move(sop2));
@@ -3229,7 +3232,9 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
  
        /* align offset down if needed */
        Operand aligned_offset = offset;
+      unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
        if (need_to_align_offset) {
+         align = 4;
           Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
           if (offset.isConstant()) {
              aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
@@ -3249,7 +3254,6 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
        Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
                                  bld.copy(bld.def(s1), aligned_offset);
  
-      unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
        Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
                            reduced_const_offset, byte_align ? Temp() : info->dst);
  
@@ -3311,7 +3315,7 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
        if (num_tmps > 1) {
           aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
              aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
-         for (unsigned i = 0; i < num_vals; i++)
+         for (unsigned i = 0; i < num_tmps; i++)
              vec->operands[i] = Operand(tmp[i]);
           tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
           vec->definitions[0] = Definition(tmp[0]);
@@ -3510,10 +3514,10 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
  
     unsigned bytes_size = 0;
     aco_opcode op;
-   if (bytes_needed == 1) {
+   if (bytes_needed == 1 || align_ % 2) {
        bytes_size = 1;
        op = aco_opcode::buffer_load_ubyte;
-   } else if (bytes_needed == 2) {
+   } else if (bytes_needed == 2 || align_ % 4) {
        bytes_size = 2;
        op = aco_opcode::buffer_load_ushort;
     } else if (bytes_needed <= 4) {
@@ -3540,7 +3544,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
     mubuf->can_reorder = info->can_reorder;
     mubuf->offset = const_offset;
     mubuf->swizzled = info->swizzle_component_size != 0;
-   RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+   RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
     Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
     mubuf->definitions[0] = Definition(val);
     bld.insert(std::move(mubuf));
@@ -3549,6 +3553,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
  }
  
  static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
+static auto emit_scratch_load = emit_load<mubuf_load_callback, false, true, 4096>;
  
  Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
  {
@@ -3673,13 +3678,15 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp
        /* use allocated_vec if possible */
        auto it = ctx->allocated_vec.find(src.id());
        if (it != ctx->allocated_vec.end()) {
-         unsigned total_size = 0;
-         for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++)
-            total_size += it->second[i].bytes();
-         if (total_size != src.bytes())
+         if (!it->second[0].id())
              goto split;
-
           unsigned elem_size = it->second[0].bytes();
+         assert(src.bytes() % elem_size == 0);
+
+         for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
+            if (!it->second[i].id())
+               goto split;
+         }
  
           for (unsigned i = 0; i < count; i++) {
              if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
@@ -3711,10 +3718,11 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp
        }
     }
  
+   split:
+
     if (dst_type == RegType::sgpr)
        src = bld.as_uniform(src);
  
-   split:
     /* just split it */
     aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
     split->operands[0] = Operand(src);
@@ -5287,7 +5295,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
  
     Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
     if (offset != 0) // TODO check if index != 0 as well
-      index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
+      index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
     Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
     Temp vec = dst;
     bool trim = false;
@@ -5373,7 +5381,7 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
  
     Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
     if (base && offset.type() == RegType::sgpr)
-      offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
+      offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
     else if (base && offset.type() == RegType::vgpr)
        offset = bld.vadd32(bld.def(v1), Operand(base), offset);
  
@@ -6255,8 +6263,8 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
           aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
           store->operands[0] = Operand(rsrc);
           if (offsets[i]) {
-            Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
-                                offset, Operand(offsets[i]));
+            Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
+                                      offset, Operand(offsets[i]));
              store->operands[1] = Operand(off);
           } else {
              store->operands[1] = Operand(offset);
@@ -6791,6 +6799,12 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
           op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
           num_operands = 4;
           break;
+      case nir_intrinsic_shared_atomic_fadd:
+         op32 = aco_opcode::ds_add_f32;
+         op32_rtn = aco_opcode::ds_add_rtn_f32;
+         op64 = aco_opcode::num_opcodes;
+         op64_rtn = aco_opcode::num_opcodes;
+         break;
        default:
           unreachable("Unhandled shared atomic intrinsic");
     }
@@ -6841,7 +6855,7 @@ Temp get_scratch_resource(isel_context *ctx)
        scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
  
     uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
-                        S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);;
+                        S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
  
     if (ctx->program->chip_class >= GFX10) {
        rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
@@ -6852,9 +6866,9 @@ Temp get_scratch_resource(isel_context *ctx)
                     S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
     }
  
-   /* older generations need element size = 16 bytes. element size removed in GFX9 */
+   /* older generations need element size = 4 bytes. element size removed in GFX9 */
     if (ctx->program->chip_class <= GFX8)
-      rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
+      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
  
     return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
  }
@@ -6869,10 +6883,10 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
                          instr->dest.ssa.bit_size / 8u, rsrc};
     info.align_mul = nir_intrinsic_align_mul(instr);
     info.align_offset = nir_intrinsic_align_offset(instr);
-   info.swizzle_component_size = 16;
+   info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
     info.can_reorder = false;
     info.soffset = ctx->program->scratch_offset;
-   emit_mubuf_load(ctx, bld, &info);
+   emit_scratch_load(ctx, bld, &info);
  }
  
  void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
@@ -6887,8 +6901,9 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
     unsigned write_count = 0;
     Temp write_datas[32];
     unsigned offsets[32];
+   unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
     split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
-                      16, &write_count, write_datas, offsets);
+                      swizzle_component_size, &write_count, write_datas, offsets);
  
     for (unsigned i = 0; i < write_count; i++) {
        aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
@@ -7273,6 +7288,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
        nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
        Temp private_segment_buffer = ctx->program->private_segment_buffer;
+      //TODO: bounds checking?
        if (addr.type() == RegType::sgpr) {
           Operand offset;
           if (const_addr) {
@@ -7445,6 +7461,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     case nir_intrinsic_shared_atomic_xor:
     case nir_intrinsic_shared_atomic_exchange:
     case nir_intrinsic_shared_atomic_comp_swap:
+   case nir_intrinsic_shared_atomic_fadd:
        visit_shared_atomic(ctx, instr);
        break;
     case nir_intrinsic_image_deref_load:
@@ -11026,16 +11043,6 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
  {
     isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
  
-   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
-   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
-   program->next_fp_mode.must_flush_denorms32 = false;
-   program->next_fp_mode.must_flush_denorms16_64 = false;
-   program->next_fp_mode.care_about_round32 = false;
-   program->next_fp_mode.care_about_round16_64 = false;
-   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
-   program->next_fp_mode.denorm32 = 0;
-   program->next_fp_mode.round32 = fp_round_ne;
-   program->next_fp_mode.round16_64 = fp_round_ne;
     ctx.block->fp_mode = program->next_fp_mode;
  
     add_startpgm(&ctx);