bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond));
}
+void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
+                    aco_opcode op, aco_opcode undo_op, uint32_t undo)
+{
+   /* multiply by 2^24 (16777216) so that denormal inputs become normal before the approximation */
+   Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(s2)),
+                               as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
+   Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
+   scaled = bld.vop1(op, bld.def(v1), scaled);
+   /* undo the scaling: a multiply for rcp/rsq/sqrt, an add for log2 */
+   scaled = bld.vop2(undo_op, bld.def(v1), Operand(undo), scaled);
+
+   Temp not_scaled = bld.vop1(op, bld.def(v1), val);
+
+   bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
+}
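+/* Worked example for the helper above (illustrative, not part of the change):
+ * for x = 2^-127, an f32 denormal, rcp(x * 2^24) = rcp(2^-103) = 2^103, and
+ * undoing the scaling with a multiply by 2^24 gives 2^127 = rcp(x), which the
+ * hardware approximation could not produce directly from the denormal input. */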
+
+void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
+{
+ if (ctx->block->fp_mode.denorm32 == 0) {
+ bld.vop1(aco_opcode::v_rcp_f32, dst, val);
+ return;
+ }
+
+   /* rcp(x * 2^24) = rcp(x) / 2^24, so undo by multiplying by 2^24 (0x4b800000) */
+   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, aco_opcode::v_mul_f32, 0x4b800000u);
+}
+
+void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
+{
+ if (ctx->block->fp_mode.denorm32 == 0) {
+ bld.vop1(aco_opcode::v_rsq_f32, dst, val);
+ return;
+ }
+
+   /* rsq(x * 2^24) = rsq(x) / 2^12, so undo by multiplying by 2^12 (0x45800000) */
+   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, aco_opcode::v_mul_f32, 0x45800000u);
+}
+
+void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
+{
+ if (ctx->block->fp_mode.denorm32 == 0) {
+ bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
+ return;
+ }
+
+   /* sqrt(x * 2^24) = sqrt(x) * 2^12, so undo by multiplying by 2^-12 (0x39800000) */
+   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, aco_opcode::v_mul_f32, 0x39800000u);
+}
+
+void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
+{
+ if (ctx->block->fp_mode.denorm32 == 0) {
+ bld.vop1(aco_opcode::v_log_f32, dst, val);
+ return;
+ }
+
+   /* log2(x * 2^24) = log2(x) + 24, so undo by adding -24.0 (0xc1c00000), not by multiplying */
+   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, aco_opcode::v_add_f32, 0xc1c00000u);
+}
+
void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
{
if (!instr->dest.dest.is_ssa) {
}
case nir_op_frsq: {
if (dst.size() == 1) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f32, dst);
+ emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
} else if (dst.size() == 2) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
} else {
case nir_op_fneg: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.size() == 1) {
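+      /* a multiply by 1.0 flushes denormals when flushing is required;
+       * the bitwise xor below would otherwise leave them intact */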
+ if (ctx->block->fp_mode.must_flush_denorms32)
+ src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
} else if (dst.size() == 2) {
+ if (ctx->block->fp_mode.must_flush_denorms16_64)
+ src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
case nir_op_fabs: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.size() == 1) {
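+      /* a multiply by 1.0 flushes denormals when flushing is required;
+       * the bitwise and below would otherwise leave them intact */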
+ if (ctx->block->fp_mode.must_flush_denorms32)
+ src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
} else if (dst.size() == 2) {
+ if (ctx->block->fp_mode.must_flush_denorms16_64)
+ src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
}
case nir_op_flog2: {
if (dst.size() == 1) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f32, dst);
+ emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_frcp: {
if (dst.size() == 1) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f32, dst);
+ emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
} else if (dst.size() == 2) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
} else {
}
case nir_op_fsqrt: {
if (dst.size() == 1) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f32, dst);
+ emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
} else if (dst.size() == 2) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
} else {
Temp src0 = bld.tmp(v1);
Temp src1 = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
- bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
-
+      /* v_cvt_pkrtz always rounds towards zero; otherwise convert with
+       * v_cvt_f16_f32, which honours the current rounding mode, and pack the halves */
+      if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
+         bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
+      else
+         bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
+                  bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src0),
+                  bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src1));
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_fquantize2f16: {
- Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), get_alu_src(ctx, instr->src[0]));
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
- bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
+ if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
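+         /* multiplying by +0 yields a zero that keeps the sign of src */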
+ Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src));
+ bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
+ } else {
+ bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
+ }
break;
}
case nir_op_bfm: {
}
}
+void setup_fp_mode(isel_context *ctx, nir_shader *shader)
+{
+ Program *program = ctx->program;
+
+ unsigned float_controls = shader->info.float_controls_execution_mode;
+
+ program->next_fp_mode.preserve_signed_zero_inf_nan32 =
+ float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
+ program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
+ float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
+ FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
+
+ program->next_fp_mode.must_flush_denorms32 =
+ float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
+ program->next_fp_mode.must_flush_denorms16_64 =
+ float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
+ FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
+
+ program->next_fp_mode.care_about_round32 =
+ float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
+
+ program->next_fp_mode.care_about_round16_64 =
+ float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
+ FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
+
+ /* default to preserving fp16 and fp64 denorms, since it's free */
+ if (program->next_fp_mode.must_flush_denorms16_64)
+ program->next_fp_mode.denorm16_64 = 0;
+ else
+ program->next_fp_mode.denorm16_64 = fp_denorm_keep;
+
+ /* preserving fp32 denorms is expensive, so only do it if asked */
+ if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
+ program->next_fp_mode.denorm32 = fp_denorm_keep;
+ else
+ program->next_fp_mode.denorm32 = 0;
+
+ if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
+ program->next_fp_mode.round32 = fp_round_tz;
+ else
+ program->next_fp_mode.round32 = fp_round_ne;
+
+ if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
+ program->next_fp_mode.round16_64 = fp_round_tz;
+ else
+ program->next_fp_mode.round16_64 = fp_round_ne;
+
+ ctx->block->fp_mode = program->next_fp_mode;
+}
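+/* Example (illustrative, derived from the mapping above): with
+ * FLOAT_CONTROLS_DENORM_PRESERVE_FP32 and FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 set,
+ * this yields denorm32 = fp_denorm_keep, round32 = fp_round_tz,
+ * care_about_round32 = true and must_flush_denorms32 = false, while the
+ * 16/64-bit state keeps its defaults: denorm16_64 = fp_denorm_keep,
+ * round16_64 = fp_round_ne. */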
+
void select_program(Program *program,
unsigned shader_count,
struct nir_shader *const *shaders,
nir_shader *nir = shaders[i];
init_context(&ctx, nir);
+ setup_fp_mode(&ctx, nir);
+
if (!i) {
add_startpgm(&ctx); /* needs to be after init_context() for FS */
append_logical_start(ctx.block);
ralloc_free(ctx.divergent_vals);
}
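+   /* the first block's FP mode is what the hardware starts the shader with,
+    * via the float_mode field of the program config */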
+ program->config->float_mode = program->blocks[0].fp_mode.val;
+
append_logical_end(ctx.block);
ctx.block->kind |= block_kind_uniform;
Builder bld(ctx.program, ctx.block);
return false;
}
-void label_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
+void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
{
if (instr->isSALU() || instr->isVALU() || instr->format == Format::PSEUDO) {
ASSERTED bool all_const = false;
ctx.info[instr->operands[i].tempId()].set_omod4();
} else if (instr->operands[!i].constantValue() == 0x3f000000) { /* 0.5 */
ctx.info[instr->operands[i].tempId()].set_omod5();
- } else if (instr->operands[!i].constantValue() == 0x3f800000) { /* 1.0 */
+ } else if (instr->operands[!i].constantValue() == 0x3f800000 &&
+ !block.fp_mode.must_flush_denorms32) { /* 1.0 */
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp());
} else {
continue;
}
}
-bool apply_omod_clamp(opt_ctx &ctx, aco_ptr<Instruction>& instr)
+bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
{
/* check if we could apply omod on predecessor */
if (instr->opcode == aco_opcode::v_mul_f32) {
}
}
+ /* omod has no effect if denormals are enabled */
+ bool can_use_omod = block.fp_mode.denorm32 == 0;
+
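+   /* omod values used below: 1 multiplies the result by 2, 2 multiplies by 4,
+    * 3 divides by 2 */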
/* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
if (!instr->definitions.empty() && ctx.uses[instr->definitions[0].tempId()] == 1 &&
can_use_VOP3(instr) && instr_info.can_use_output_modifiers[(int)instr->opcode]) {
- if(ctx.info[instr->definitions[0].tempId()].is_omod2()) {
+ if (can_use_omod && ctx.info[instr->definitions[0].tempId()].is_omod2()) {
to_VOP3(ctx, instr);
static_cast<VOP3A_instruction*>(instr.get())->omod = 1;
ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get());
- } else if (ctx.info[instr->definitions[0].tempId()].is_omod4()) {
+ } else if (can_use_omod && ctx.info[instr->definitions[0].tempId()].is_omod4()) {
to_VOP3(ctx, instr);
static_cast<VOP3A_instruction*>(instr.get())->omod = 2;
ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get());
- } else if (ctx.info[instr->definitions[0].tempId()].is_omod5()) {
+ } else if (can_use_omod && ctx.info[instr->definitions[0].tempId()].is_omod5()) {
to_VOP3(ctx, instr);
static_cast<VOP3A_instruction*>(instr.get())->omod = 3;
ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get());
// TODO: we could possibly move the whole label_instruction pass to combine_instruction:
// this would mean that we'd have to fix the instruction uses while value propagation
-void combine_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
+void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
{
if (instr->definitions.empty() || !ctx.uses[instr->definitions[0].tempId()])
return;
if (instr->isVALU()) {
if (can_apply_sgprs(instr))
apply_sgprs(ctx, instr);
- if (apply_omod_clamp(ctx, instr))
+ if (apply_omod_clamp(ctx, block, instr))
return;
}
return;
}
/* combine mul+add -> mad */
- else if (instr->opcode == aco_opcode::v_add_f32 ||
- instr->opcode == aco_opcode::v_sub_f32 ||
- instr->opcode == aco_opcode::v_subrev_f32) {
+ else if ((instr->opcode == aco_opcode::v_add_f32 ||
+ instr->opcode == aco_opcode::v_sub_f32 ||
+ instr->opcode == aco_opcode::v_subrev_f32) &&
+ block.fp_mode.denorm32 == 0 && !block.fp_mode.preserve_signed_zero_inf_nan32) {
+ //TODO: we could use fma instead when denormals are enabled if the NIR isn't marked as precise
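+      /* a single mad is not guaranteed to be bit-identical to the separate
+       * v_mul/v_add (v_mad_f32 flushes denormals, and signed zero/inf/NaN
+       * corner cases can differ), hence the fp_mode checks above */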
uint32_t uses_src0 = UINT32_MAX;
uint32_t uses_src1 = UINT32_MAX;
/* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
for (Block& block : program->blocks) {
for (aco_ptr<Instruction>& instr : block.instructions)
- label_instruction(ctx, instr);
+ label_instruction(ctx, block, instr);
}
ctx.uses = std::move(dead_code_analysis(program));
/* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
for (Block& block : program->blocks) {
for (aco_ptr<Instruction>& instr : block.instructions)
- combine_instruction(ctx, instr);
+ combine_instruction(ctx, block, instr);
}
/* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */