Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
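+ /* on GFX10, ds_bpermute only reads within a half-wave of a wave64; compare
+  * bit 5 to tell whether the source lane and the index are in the same half */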
- Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2, vcc), lane_is_hi, index_is_hi);
+ Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi);
return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
return;
aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
split->operands[0] = Operand(vec_src);
- std::array<Temp,4> elems;
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
for (unsigned i = 0; i < num_components; i++) {
elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
split->definitions[i] = Definition(elems[i]);
}
unsigned component_size = dst.size() / num_components;
- std::array<Temp,4> elems;
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
vec->definitions[0] = Definition(dst);
return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
} else {
assert(size <= 4);
- std::array<Temp,4> elems;
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
for (unsigned i = 0; i < size; ++i) {
elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
ctx->block->instructions.emplace_back(std::move(sop2));
}
-void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false)
+void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
+ bool commutative, bool swap_srcs=false, bool flush_denorms = false)
{
Builder bld(ctx->program, ctx->block);
Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
}
}
- bld.vop2(op, Definition(dst), src0, src1);
+
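+ /* GFX8 and earlier don't flush denormal results of v_min_f32/v_max_f32 even
+  * when the float mode requests it; multiplying by 1.0 afterwards is exact
+  * and applies the flush */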
+ if (flush_denorms && ctx->program->chip_class < GFX9) {
+ assert(dst.size() == 1);
+ Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
+ bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
+ } else {
+ bld.vop2(op, Definition(dst), src0, src1);
+ }
}
-void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
+void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
+ bool flush_denorms = false)
{
Temp src0 = get_alu_src(ctx, instr->src[0]);
Temp src1 = get_alu_src(ctx, instr->src[1]);
src2 = as_vgpr(ctx, src2);
Builder bld(ctx->program, ctx->block);
- bld.vop3(op, Definition(dst), src0, src1, src2);
+ if (flush_denorms && ctx->program->chip_class < GFX9) {
+ assert(dst.size() == 1);
+ Temp tmp = bld.vop3(op, bld.def(v1), src0, src1, src2);
+ bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
+ } else {
+ bld.vop3(op, Definition(dst), src0, src1, src2);
+ }
}
void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
}
void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
- aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::last_opcode, aco_opcode s64_op = aco_opcode::last_opcode)
+ aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
{
aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op;
aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op;
bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
- bool use_valu = s_op == aco_opcode::last_opcode ||
+ bool use_valu = s_op == aco_opcode::num_opcodes ||
divergent_vals ||
ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
aco_opcode op = use_valu ? v_op : s_op;
- assert(op != aco_opcode::last_opcode);
+ assert(op != aco_opcode::num_opcodes);
+ assert(dst.regClass() == ctx->program->lane_mask);
if (use_valu)
emit_vopc_instruction(ctx, instr, op, dst);
case nir_op_vec2:
case nir_op_vec3:
case nir_op_vec4: {
- std::array<Temp,4> elems;
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
elems[i] = get_alu_src(ctx, instr->src[i]);
case nir_op_ushr: {
if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
- } else if (dst.regClass() == v2) {
+ } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
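+ /* GFX8 replaced the 64-bit shifts with "rev" variants that take the shift
+  * amount first; GFX6/7 only have v_lshr/v_lshl/v_ashr with (value, shift) */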
bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
+ } else if (dst.regClass() == v2) {
+ bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
+ get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
} else if (dst.regClass() == s2) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
} else if (dst.regClass() == s1) {
case nir_op_ishl: {
if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
- } else if (dst.regClass() == v2) {
+ } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
+ } else if (dst.regClass() == v2) {
+ bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
+ get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
} else if (dst.regClass() == s2) {
case nir_op_ishr: {
if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
- } else if (dst.regClass() == v2) {
+ } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
+ } else if (dst.regClass() == v2) {
+ bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
+ get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
} else if (dst.regClass() == s2) {
}
case nir_op_fmax: {
if (dst.size() == 1) {
- emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
} else if (dst.size() == 2) {
- bld.vop3(aco_opcode::v_max_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
+ Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2),
+ get_alu_src(ctx, instr->src[0]),
+ as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
+ } else {
+ bld.vop3(aco_opcode::v_max_f64, Definition(dst),
+ get_alu_src(ctx, instr->src[0]),
+ as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ }
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_fmin: {
if (dst.size() == 1) {
- emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
} else if (dst.size() == 2) {
- bld.vop3(aco_opcode::v_min_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
+ Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2),
+ get_alu_src(ctx, instr->src[0]),
+ as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
+ } else {
+ bld.vop3(aco_opcode::v_min_f64, Definition(dst),
+ get_alu_src(ctx, instr->src[0]),
+ as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ }
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_fmax3: {
if (dst.size() == 1) {
- emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst);
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_fmin3: {
if (dst.size() == 1) {
- emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst);
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_fmed3: {
if (dst.size() == 1) {
- emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst);
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.size() == 1) {
bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
+ /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
+ // TODO: confirm that this holds under any circumstances
} else if (dst.size() == 2) {
Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
Temp src = get_alu_src(ctx, instr->src[0]);
aco_ptr<Instruction> norm;
if (dst.size() == 1) {
- Temp tmp;
- Operand half_pi(0x3e22f983u);
- if (src.type() == RegType::sgpr)
- tmp = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
- else
- tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
+ Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
+ Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, as_vgpr(ctx, src));
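+ /* 0x3e22f983 is 0.5/PI: v_sin_f32/v_cos_f32 take an input scaled so that
+  * 1.0 corresponds to one full period of 2*PI radians */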
/* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
if (ctx->options->chip_class < GFX9)
mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
Temp new_exponent = bld.tmp(v1);
Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
- mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
+ if (ctx->program->chip_class >= GFX8)
+ mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
+ else
+ mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
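+ /* v_bfrev_b32 of the inline constant 0xfffffffe yields 0x7fffffff (INT_MAX)
+  * without needing a literal */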
Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
Temp new_exponent = bld.tmp(v1);
Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
- mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
+ if (ctx->program->chip_class >= GFX8)
+ mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
+ else
+ mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
case nir_op_i2i64: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (src.regClass() == s1) {
- Temp high = bld.sopc(aco_opcode::s_ashr_i32, bld.def(s1, scc), src, Operand(31u));
+ Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
} else if (src.regClass() == v1) {
Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
if (src.type() == RegType::vgpr) {
assert(src.regClass() == v1 || src.regClass() == v2);
+ assert(dst.regClass() == bld.lm);
bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
Definition(dst), Operand(0u), src).def(0).setHint(vcc);
} else {
assert(src.regClass() == s1 || src.regClass() == s2);
- Temp tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
- bld.scc(bld.def(s1)), Operand(0u), src);
+ Temp tmp;
+ if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
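+ /* GFX7 has no s_cmp_lg_u64, but s_or_b64 sets SCC to (result != 0), which
+  * is the same condition */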
+ tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp();
+ } else {
+ tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
+ bld.scc(bld.def(s1)), Operand(0u), src);
+ }
bool_to_vector_condition(ctx, tmp, dst);
}
break;
*/
f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u));
- Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), f32, smallest);
+ Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), f32, smallest);
static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
cmp_res = vop3->definitions[0].getTemp();
}
if (instr->src[0].src.ssa->bit_size == 1)
emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
else
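+ /* s_cmp_{eq,lg}_u64 only exist on GFX8+; passing num_opcodes makes
+  * emit_comparison fall back to the VALU opcode */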
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, aco_opcode::s_cmp_eq_u64);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
+ ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
break;
}
case nir_op_ine: {
if (instr->src[0].src.ssa->bit_size == 1)
emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
else
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, aco_opcode::s_cmp_lg_u64);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
+ ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
break;
}
case nir_op_ult: {
unsigned bytes_read = 0;
unsigned result_size = 0;
unsigned total_bytes = num_components * elem_size_bytes;
- std::array<Temp, 4> result;
+ std::array<Temp, NIR_MAX_VEC_COMPONENTS> result;
while (bytes_read < total_bytes) {
unsigned todo = total_bytes - bytes_read;
aco_opcode op;
if (dst.type() == RegType::vgpr || (ctx->options->chip_class < GFX8 && !readonly)) {
- if (ctx->options->chip_class < GFX8)
- offset = as_vgpr(ctx, offset);
-
Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
unsigned const_offset = 0;
Temp vec = bld.tmp(RegType::vgpr, dst.size());
instr->definitions[0] = Definition(vec);
bld.insert(std::move(instr));
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
+ expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1);
} else {
instr->definitions[0] = Definition(dst);
bld.insert(std::move(instr));
+ emit_split_vector(ctx, dst, num_components);
}
} else {
switch (num_bytes) {
} else {
bld.insert(std::move(load));
}
-
+ emit_split_vector(ctx, dst, num_components);
}
- emit_split_vector(ctx, dst, num_components);
}
void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
if (ctx->options->chip_class >= GFX10) {
desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(3) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
S_008F0C_RESOURCE_LEVEL(1);
} else {
desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
if (ctx->options->chip_class >= GFX10) {
desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(3) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
S_008F0C_RESOURCE_LEVEL(1);
} else {
desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
}
+ if (instr->intrinsic == nir_intrinsic_image_deref_load ||
+ instr->intrinsic == nir_intrinsic_image_deref_store) {
+ int lod_index = instr->intrinsic == nir_intrinsic_image_deref_load ? 3 : 4;
+ bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
+
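+ /* the _mip opcode variants take the mip level as an extra address
+  * component, so append it unless it is known to be zero */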
+ if (!level_zero)
+ coords.emplace_back(Operand(get_ssa_temp(ctx, instr->src[lod_index].ssa)));
+ }
+
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
for (unsigned i = 0; i < coords.size(); i++)
vec->operands[i] = coords[i];
else
tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
- aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
+ bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
+ aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
+
+ aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 2, 1)};
load->operands[0] = Operand(coords);
load->operands[1] = Operand(resource);
load->definitions[0] = Definition(tmp);
Temp coords = get_image_coords(ctx, instr, type);
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
- aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
+ bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
+ aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
+
+ aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 4, 0)};
store->operands[0] = Operand(coords);
store->operands[1] = Operand(resource);
store->operands[2] = Operand(s4);
void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
{
if (in_elements && ctx->options->chip_class == GFX8) {
+ /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
Builder bld(ctx->program, ctx->block);
+ Temp size = emit_extract_vector(ctx, desc, 2, s1);
+
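+ /* unsigned division by 3 via multiply-high: 0xaaaaaaab == ceil(2^33 / 3),
+  * so mulhi(size, 0xaaaaaaab) >> 1 == size / 3 exactly for any u32 size.
+  * stride 12 then reduces to (size / 3) >> s_ff1(12) == size / 12 below */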
+ Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size);
+ size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), bld.as_uniform(size_div3), Operand(1u));
+
Temp stride = emit_extract_vector(ctx, desc, 1, s1);
stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
- stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
- stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
- Temp size = emit_extract_vector(ctx, desc, 2, s1);
- size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
-
- Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
- res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
-
- // TODO: we can probably calculate this faster on the scalar unit to do: size / stride{1,2,4,8,12,16}
- /* idea
- * for 1,2,4,8,16, the result is just (stride >> S_FF1_I32_B32)
- * in case 12 (or 3?), we have to divide by 3:
- * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
- * use v_mul_hi_u32 with magic number to divide
- * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
- * disable v_skip
- * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
- */
+ Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u));
+ size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
+ Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
+ bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc),
+ size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
+ if (dst.type() == RegType::vgpr)
+ bld.copy(Definition(dst), shr_dst);
+
+ /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
} else {
emit_extract_vector(ctx, desc, 2, dst);
}
aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
mimg->operands[0] = Operand(lod);
mimg->operands[1] = Operand(resource);
- unsigned& dmask = mimg->dmask;
+ uint8_t& dmask = mimg->dmask;
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
mimg->da = glsl_sampler_type_is_array(type);
Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
unsigned writemask = nir_intrinsic_write_mask(instr);
-
- Temp offset;
- if (ctx->options->chip_class < GFX8)
- offset = as_vgpr(ctx,get_ssa_temp(ctx, instr->src[2].ssa));
- else
- offset = get_ssa_temp(ctx, instr->src[2].ssa);
+ Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
get_ssa_temp(ctx, instr->src[3].ssa), data);
- Temp offset;
- if (ctx->options->chip_class < GFX8)
- offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
- else
- offset = get_ssa_temp(ctx, instr->src[1].ssa);
-
+ Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
if (ctx->program->chip_class >= GFX10) {
rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(3) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
S_008F0C_RESOURCE_LEVEL(1);
} else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
- if (ctx->program->wave_size == 64)
+
+ if (ctx->program->chip_class <= GFX7)
+ tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
+ else if (ctx->program->wave_size == 64)
tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
else
tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
case nir_intrinsic_get_buffer_size:
visit_get_buffer_size(ctx, instr);
break;
- case nir_intrinsic_barrier: {
+ case nir_intrinsic_control_barrier: {
unsigned* bsize = ctx->program->info->cs.block_size;
unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
if (workgroup_size > ctx->program->wave_size)
case nir_intrinsic_memory_barrier_shared:
emit_memory_barrier(ctx, instr);
break;
+ case nir_intrinsic_memory_barrier_tcs_patch:
+ break;
case nir_intrinsic_load_num_work_groups: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
} else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
assert(src.regClass() == bld.lm);
Temp tmp;
- if (ctx->program->wave_size == 64)
+ if (ctx->program->chip_class <= GFX7)
+ tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
+ else if (ctx->program->wave_size == 64)
tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
else
tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
}
case nir_intrinsic_demote:
bld.pseudo(aco_opcode::p_demote_to_helper);
+
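+ /* a demote inside divergent control flow may leave exec empty, so
+  * subsequent code must not assume any lanes are still active */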
+ if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
+ ctx->cf_info.exec_potentially_empty = true;
ctx->block->kind |= block_kind_uses_demote;
ctx->program->needs_exact = true;
break;
assert(src.regClass() == bld.lm);
Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
bld.pseudo(aco_opcode::p_demote_to_helper, cond);
+
+ if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
+ ctx->cf_info.exec_potentially_empty = true;
ctx->block->kind |= block_kind_uses_demote;
ctx->program->needs_exact = true;
break;
Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
- Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
+ Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_y, is_ma_z);
Temp is_not_ma_x = bld.sop2(Builder::s_or, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y);
if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) {
// TODO: scalarize linear phis on divergent ifs
bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
- std::array<Temp, 4> new_vec;
+ std::array<Temp, NIR_MAX_VEC_COMPONENTS> new_vec;
for (unsigned i = 0; can_scalarize && (i < num_operands); i++) {
Operand src = operands[i];
if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end())
else
exp->operands[i] = Operand(v1);
}
- exp->valid_mask = false;
+ /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
+ * Setting valid_mask=1 prevents it and has no other effect.
+ */
+ exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0;
exp->done = false;
exp->compressed = false;
if (is_pos)
exp->enabled_mask |= 0x4;
}
}
- exp->valid_mask = false;
+ exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0;
exp->done = false;
exp->compressed = false;
exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
Temp tid = emit_mbcnt(ctx, bld.def(v1));
- Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid);
+ Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
if_context ic;
begin_divergent_if_then(ctx, &ic, can_emit);
/* Split all arguments except for the first (ring_offsets) and the last
* (exec) so that the dead channels don't stay live throughout the program.
*/
- for (unsigned i = 1; i < startpgm->definitions.size() - 1; i++) {
+ for (unsigned i = 1; i + 1 < startpgm->definitions.size(); i++) {
if (startpgm->definitions[i].regClass().size() > 1) {
emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
startpgm->definitions[i].regClass().size());