Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
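+ /* bit 5 of a lane id selects the high half of a wave64 wave (lanes 32-63) */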
Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
- Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2, vcc), lane_is_hi, index_is_hi);
+ Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi);
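+ /* p_wave64_bpermute is only ever emitted in wave64 mode, so the s2 definition below can stay as it is */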
return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
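+ /* a VGPR source forces the VALU form of the comparison */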
aco_opcode op = use_valu ? v_op : s_op;
assert(op != aco_opcode::num_opcodes);
+ assert(dst.regClass() == ctx->program->lane_mask);
if (use_valu)
emit_vopc_instruction(ctx, instr, op, dst);
if (src.type() == RegType::vgpr) {
assert(src.regClass() == v1 || src.regClass() == v2);
+ assert(dst.regClass() == bld.lm);
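+ /* comparing against zero turns the boolean value into a lane mask */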
bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
Definition(dst), Operand(0u), src).def(0).setHint(vcc);
} else {
*/
f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u));
- Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), f32, smallest);
+ Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), f32, smallest);
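+ /* the abs modifier set below makes the smallest-normal check independent of the sign */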
static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
cmp_res = vop3->definitions[0].getTemp();
}
coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
}
+ if (instr->intrinsic == nir_intrinsic_image_deref_load ||
+ instr->intrinsic == nir_intrinsic_image_deref_store) {
+ int lod_index = instr->intrinsic == nir_intrinsic_image_deref_load ? 3 : 4;
+ bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
+
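+ /* a non-zero LOD rides along as an extra trailing coordinate for the *_mip opcodes */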
+ if (!level_zero)
+ coords.emplace_back(Operand(get_ssa_temp(ctx, instr->src[lod_index].ssa)));
+ }
+
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
for (unsigned i = 0; i < coords.size(); i++)
vec->operands[i] = coords[i];
else
tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
- aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
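+ /* a LOD that is known to be zero keeps the plain opcode; otherwise get_image_coords has appended the LOD as the last coordinate */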
+ bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
+ aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
+
+ aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 2, 1)};
load->operands[0] = Operand(coords);
load->operands[1] = Operand(resource);
load->definitions[0] = Definition(tmp);
Temp coords = get_image_coords(ctx, instr, type);
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
- aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
+ bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
+ aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
+
+ aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 4, 0)};
store->operands[0] = Operand(coords);
store->operands[1] = Operand(resource);
store->operands[2] = Operand(s4); /* no sampler */
void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
{
if (in_elements && ctx->options->chip_class == GFX8) {
+ /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
Builder bld(ctx->program, ctx->block);
+ Temp size = emit_extract_vector(ctx, desc, 2, s1);
+
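+ /* 0xaaaaaaab == ceil(2^33 / 3): for a 32-bit size, (size * 0xaaaaaaab) >> 33
+  * equals size / 3 exactly, and v_mul_hi_u32 already yields bits [63:32] of the
+  * product, so only one more shift by 1 is needed.
+  */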
+ Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size);
+ size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.as_uniform(size_div3), Operand(1u));
+
Temp stride = emit_extract_vector(ctx, desc, 1, s1);
stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
- stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
- stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
- Temp size = emit_extract_vector(ctx, desc, 2, s1);
- size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
-
- Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
- res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
-
- // TODO: we can probably calculate this faster on the scalar unit to do: size / stride{1,2,4,8,12,16}
- /* idea
- * for 1,2,4,8,16, the result is just (stride >> S_FF1_I32_B32)
- * in case 12 (or 3?), we have to divide by 3:
- * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
- * use v_mul_hi_u32 with magic number to divide
- * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
- * disable v_skip
- * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
- */
+ Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u));
+ size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
+
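+ /* with the stride-12 case reduced to a division by 3 above, the remaining
+  * division is always by a power of two, i.e. a right-shift by the stride's
+  * lowest set bit: s_ff1(12) == 2 supplies the missing /4 for stride 12.
+  */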
+ Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
+ bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc),
+ size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
+ if (dst.type() == RegType::vgpr)
+ bld.copy(Definition(dst), shr_dst);
+ /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
} else {
emit_extract_vector(ctx, desc, 2, dst);
}
aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
mimg->operands[0] = Operand(lod);
mimg->operands[1] = Operand(resource);
- unsigned& dmask = mimg->dmask;
+ uint8_t& dmask = mimg->dmask;
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
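+ /* request one resinfo result dword per destination component */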
mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
mimg->da = glsl_sampler_type_is_array(type);
case nir_intrinsic_get_buffer_size:
visit_get_buffer_size(ctx, instr);
break;
- case nir_intrinsic_barrier: {
+ case nir_intrinsic_control_barrier: {
unsigned* bsize = ctx->program->info->cs.block_size;
unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
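+ /* a workgroup that fits into a single wave runs in lock-step and needs no s_barrier */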
if (workgroup_size > ctx->program->wave_size)
case nir_intrinsic_memory_barrier_shared:
emit_memory_barrier(ctx, instr);
break;
+ case nir_intrinsic_memory_barrier_tcs_patch:
+ break;
case nir_intrinsic_load_num_work_groups: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
}
case nir_intrinsic_demote:
bld.pseudo(aco_opcode::p_demote_to_helper);
+
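+ /* a demote under divergent control flow can leave the exec mask empty */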
+ if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
+ ctx->cf_info.exec_potentially_empty = true;
ctx->block->kind |= block_kind_uses_demote;
ctx->program->needs_exact = true;
break;
assert(src.regClass() == bld.lm);
Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
bld.pseudo(aco_opcode::p_demote_to_helper, cond);
+
+ if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
+ ctx->cf_info.exec_potentially_empty = true;
ctx->block->kind |= block_kind_uses_demote;
ctx->program->needs_exact = true;
break;
Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
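+ /* the cube face id is 0/1 for X, 2/3 for Y and 4/5 for Z */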
Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
- Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
+ Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
Temp is_not_ma_x = bld.sop2(Builder::s_or, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y);
else
exp->operands[i] = Operand(v1);
}
- exp->valid_mask = false;
+ /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
+ * Setting valid_mask=1 prevents it and has no other effect.
+ */
+ exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0;
exp->done = false;
exp->compressed = false;
if (is_pos)
exp->enabled_mask |= 0x4;
}
}
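+ /* same Navi10-14 workaround as above; this path only emits position exports */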
- exp->valid_mask = false;
+ exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0;
exp->done = false;
exp->compressed = false;
exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
Temp tid = emit_mbcnt(ctx, bld.def(v1));
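+ /* only lanes with tid < so_vtx_count have a slot in the streamout buffer */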
- Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid);
+ Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
if_context ic;
begin_divergent_if_then(ctx, &ic, can_emit);
/* Split all arguments except for the first (ring_offsets) and the last
* (exec) so that the dead channels don't stay live throughout the program.
*/
- for (unsigned i = 1; i < startpgm->definitions.size() - 1; i++) {
+ for (int i = 1; i < startpgm->definitions.size() - 1; i++) {
if (startpgm->definitions[i].regClass().size() > 1) {
emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
startpgm->definitions[i].regClass().size());