offset = Operand(0u);
}
- unsigned num_components = dst.bytes() / component_size;
+ unsigned num_components = vec.bytes() / component_size;
if (vec.regClass() == dst.regClass()) {
assert(offset.constantValue() == 0);
bld.copy(Definition(dst), vec);
return;
}
- emit_split_vector(ctx, vec, vec.bytes() / component_size);
+ emit_split_vector(ctx, vec, num_components);
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
assert(offset.constantValue() % component_size == 0);
unsigned skip = offset.constantValue() / component_size;
- for (unsigned i = 0; i < num_components; i++)
- elems[i] = emit_extract_vector(ctx, vec, i + skip, rc);
+ for (unsigned i = skip; i < num_components; i++)
+ elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
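/* With num_components now counting the source's components, the loop walks the
 * source indices directly and stores from elems[0] onward; num_components is
 * recomputed from dst below so p_create_vector only packs what dst can hold. */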
/* if dst is a vgpr, split the src and create a shrunk version according to the mask. */
if (dst.type() == RegType::vgpr) {
+ num_components = dst.bytes() / component_size;
aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
for (unsigned i = 0; i < num_components; i++)
create_vec->operands[i] = Operand(elems[i]);
sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
sop2->definitions[0] = Definition(dst);
+ if (instr->no_unsigned_wrap)
+ sop2->definitions[0].setNUW(true);
if (writes_scc)
sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
ctx->block->instructions.emplace_back(std::move(sop2));
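/* setNUW propagates NIR's no_unsigned_wrap guarantee to the result, presumably
 * so later passes can fold the add into memory offsets without having to worry
 * about unsigned overflow. */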
/* align offset down if needed */
Operand aligned_offset = offset;
+ unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
if (need_to_align_offset) {
+ align = 4;
Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
if (offset.isConstant()) {
aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
bld.copy(bld.def(s1), aligned_offset);
- unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
reduced_const_offset, byte_align ? Temp() : info->dst);
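/* align is computed before the offset is touched; once the offset has been
 * aligned down to a multiple of 4, all the callback can be promised is 4-byte
 * alignment, hence align = 4 on that path. */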
if (num_tmps > 1) {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
- for (unsigned i = 0; i < num_vals; i++)
+ for (unsigned i = 0; i < num_tmps; i++)
vec->operands[i] = Operand(tmp[i]);
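/* The loop bound must be num_tmps, the operand count the p_create_vector above
 * was allocated with; iterating by num_vals could mismatch that allocation. */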
tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
vec->definitions[0] = Definition(tmp[0]);
unsigned bytes_size = 0;
aco_opcode op;
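/* For under-aligned accesses (the align_ checks below), fall back to byte or
 * short loads regardless of bytes_needed; the emit_load loop presumably keeps
 * calling back until all requested bytes are covered. */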
- if (bytes_needed == 1) {
+ if (bytes_needed == 1 || align_ % 2) {
bytes_size = 1;
op = aco_opcode::buffer_load_ubyte;
- } else if (bytes_needed == 2) {
+ } else if (bytes_needed == 2 || align_ % 4) {
bytes_size = 2;
op = aco_opcode::buffer_load_ushort;
} else if (bytes_needed <= 4) {
mubuf->can_reorder = info->can_reorder;
mubuf->offset = const_offset;
mubuf->swizzled = info->swizzle_component_size != 0;
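/* bytes_size is no longer rounded up to a full dword, so 1- and 2-byte loads
 * get subdword register classes for their definitions. */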
- RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+ RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
mubuf->definitions[0] = Definition(val);
bld.insert(std::move(mubuf));
}
static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
+static auto emit_scratch_load = emit_load<mubuf_load_callback, false, true, 4096>;
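/* Scratch reuses the MUBUF callback but flips the second template argument,
 * which appears to disable emit_load's generic byte-align lowering; the
 * callback now picks ubyte/ushort loads for under-aligned accesses itself. */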
Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
{
/* use allocated_vec if possible */
auto it = ctx->allocated_vec.find(src.id());
if (it != ctx->allocated_vec.end()) {
- unsigned total_size = 0;
- for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++)
- total_size += it->second[i].bytes();
- if (total_size != src.bytes())
+ if (!it->second[0].id())
goto split;
-
unsigned elem_size = it->second[0].bytes();
+ assert(src.bytes() % elem_size == 0);
+
+ for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
+ if (!it->second[i].id())
+ goto split;
+ }
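/* Any element without a valid id means the cached decomposition is incomplete,
 * so fall through to the generic split path. */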
for (unsigned i = 0; i < count; i++) {
   if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
      goto split;
}
}
+ split:
+
if (dst_type == RegType::sgpr)
src = bld.as_uniform(src);
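/* The split: label now sits above the as_uniform conversion, so the goto paths
 * also copy a vgpr source to an sgpr before splitting when dst_type is sgpr. */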
- split:
/* just split it */
aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
split->operands[0] = Operand(src);
Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
if (offset != 0) // TODO check if index != 0 as well
- index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
+ index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
Temp vec = dst;
bool trim = false;
unreachable("unimplemented or forbidden load_push_constant.");
}
- bld.smem(op, Definition(vec), ptr, index);
+ static_cast<SMEM_instruction*>(bld.smem(op, Definition(vec), ptr, index).instr)->prevent_overflow = true;
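/* prevent_overflow stops the optimizer from folding additions that might wrap
 * into this SMEM's offset; the nuw add above carries exactly that guarantee,
 * so it stays foldable. */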
if (!aligned) {
Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
if (base && offset.type() == RegType::sgpr)
- offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
+ offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
else if (base && offset.type() == RegType::vgpr)
offset = bld.vadd32(bld.def(v1), Operand(base), offset);
aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
store->operands[0] = Operand(rsrc);
if (offsets[i]) {
- Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
- offset, Operand(offsets[i]));
+ Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
+ offset, Operand(offsets[i]));
store->operands[1] = Operand(off);
} else {
store->operands[1] = Operand(offset);
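/* As with the loads, the offset additions above are tagged nuw so they remain
 * foldable into the SMEM offset despite the overflow restrictions. */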
op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
num_operands = 4;
break;
+ case nir_intrinsic_shared_atomic_fadd:
+ op32 = aco_opcode::ds_add_f32;
+ op32_rtn = aco_opcode::ds_add_rtn_f32;
+ op64 = aco_opcode::num_opcodes;
+ op64_rtn = aco_opcode::num_opcodes;
+ break;
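/* ds_add_f32 has no 64-bit counterpart, so the 64-bit entries use
 * aco_opcode::num_opcodes as an "unsupported" sentinel. */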
default:
unreachable("Unhandled shared atomic intrinsic");
}
scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
- S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);;
+ S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
if (ctx->program->chip_class >= GFX10) {
rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
}
- /* older generations need element size = 16 bytes. element size removed in GFX9 */
+ /* older generations need element size = 4 bytes. element size removed in GFX9 */
if (ctx->program->chip_class <= GFX8)
- rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
+ rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
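/* ELEMENT_SIZE encodes log2(bytes) - 1, so the new value 1 means 4 bytes,
 * matching the corrected comment above (the old value 3 meant 16 bytes). */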
return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
}
instr->dest.ssa.bit_size / 8u, rsrc};
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
- info.swizzle_component_size = 16;
+ info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
info.can_reorder = false;
info.soffset = ctx->program->scratch_offset;
- emit_mubuf_load(ctx, bld, &info);
+ emit_scratch_load(ctx, bld, &info);
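/* On GFX8 and earlier the scratch rsrc swizzles with 4-byte elements, so a load
 * must not straddle a component; GFX9+ dropped the element size, so 0 disables
 * the constraint, and the new emit_scratch_load handles any leftover
 * misalignment. */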
}
void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
+ unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
- 16, &write_count, write_datas, offsets);
+ swizzle_component_size, &write_count, write_datas, offsets);
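/* Stores are split the same way: 4-byte swizzle components on GFX8 and
 * earlier, 16 bytes otherwise. */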
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
Temp private_segment_buffer = ctx->program->private_segment_buffer;
+ //TODO: bounds checking?
if (addr.type() == RegType::sgpr) {
Operand offset;
if (const_addr) {
case nir_intrinsic_shared_atomic_xor:
case nir_intrinsic_shared_atomic_exchange:
case nir_intrinsic_shared_atomic_comp_swap:
+ case nir_intrinsic_shared_atomic_fadd:
visit_shared_atomic(ctx, instr);
break;
case nir_intrinsic_image_deref_load:
{
isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
- program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
- program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
- program->next_fp_mode.must_flush_denorms32 = false;
- program->next_fp_mode.must_flush_denorms16_64 = false;
- program->next_fp_mode.care_about_round32 = false;
- program->next_fp_mode.care_about_round16_64 = false;
- program->next_fp_mode.denorm16_64 = fp_denorm_keep;
- program->next_fp_mode.denorm32 = 0;
- program->next_fp_mode.round32 = fp_round_ne;
- program->next_fp_mode.round16_64 = fp_round_ne;
ctx.block->fp_mode = program->next_fp_mode;
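/* The explicit next_fp_mode resets were dropped; the defaults are presumably
 * established by setup_isel_context now, and the block still inherits
 * program->next_fp_mode here. */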
add_startpgm(&ctx);