X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_optimizer.cpp;h=0a43964457a7421700ec2fdf33eebeaa703c84df;hb=4e30191c9d3e5cdb1b65d4563f2b74e9bfdcf243;hp=58d22910150a2ee1456796c19e326994b3c63382;hpb=3d6f67950d91de1dd50b096de144e504a89ea21d;p=mesa.git diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 58d22910150..0a43964457a 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -34,6 +34,27 @@ namespace aco { +#ifndef NDEBUG +void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr) +{ + if (cond) { + char *out; + size_t outsize; + FILE *memf = open_memstream(&out, &outsize); + + fprintf(memf, "%s: ", msg); + aco_print_instr(instr, memf); + fclose(memf); + + aco_perfwarn(program, out); + free(out); + + if (debug_flags & DEBUG_PERFWARN) + exit(1); + } +} +#endif + /** * The optimizer works in 4 phases: * (1) The first pass collects information for each ssa-def, @@ -52,7 +73,7 @@ namespace aco { struct mad_info { aco_ptr add_instr; uint32_t mul_temp_id; - uint32_t literal_idx; + uint16_t literal_idx; bool check_literal; mad_info(aco_ptr instr, uint32_t id) @@ -83,7 +104,7 @@ enum Label { label_add_sub = 1 << 17, label_bitwise = 1 << 18, label_minmax = 1 << 19, - label_fcmp = 1 << 20, + label_vopc = 1 << 20, label_uniform_bool = 1 << 21, label_constant_64bit = 1 << 22, label_uniform_bitwise = 1 << 23, @@ -94,19 +115,19 @@ enum Label { label_constant_16bit = 1 << 29, }; -static constexpr uint32_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success | - label_add_sub | label_bitwise | label_uniform_bitwise | label_minmax | label_fcmp; -static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool | +static constexpr uint64_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success | + label_add_sub | label_bitwise | label_uniform_bitwise | label_minmax | label_vopc; +static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool | label_omod2 | label_omod4 | label_omod5 | label_clamp | label_scc_invert | label_b2i; -static constexpr uint32_t val_labels = label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal | label_mad; +static constexpr uint32_t val_labels = label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal; struct ssa_info { - uint32_t val; + uint64_t label; union { + uint32_t val; Temp temp; Instruction* instr; }; - uint32_t label; ssa_info() : label(0) {} @@ -116,18 +137,21 @@ struct ssa_info { * (indicating the defining instruction), there is no need to clear * any other instr labels. 
*/ if (new_label & instr_labels) - label &= ~temp_labels; /* instr and temp alias */ + label &= ~(temp_labels | val_labels); /* instr, temp and val alias */ if (new_label & temp_labels) { label &= ~temp_labels; - label &= ~instr_labels; /* instr and temp alias */ + label &= ~(instr_labels | val_labels); /* instr, temp and val alias */ } uint32_t const_labels = label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit; - if (new_label & const_labels) + if (new_label & const_labels) { label &= ~val_labels | const_labels; - else if (new_label & val_labels) + label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */ + } else if (new_label & val_labels) { label &= ~val_labels; + label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */ + } label |= new_label; } @@ -277,7 +301,7 @@ struct ssa_info { void set_mad(Instruction* mad, uint32_t mad_info_idx) { add_label(label_mad); - val = mad_info_idx; + mad->pass_flags = mad_info_idx; instr = mad; } @@ -427,15 +451,15 @@ struct ssa_info { return label & label_minmax; } - void set_fcmp(Instruction *fcmp_instr) + void set_vopc(Instruction *vopc_instr) { - add_label(label_fcmp); - instr = fcmp_instr; + add_label(label_vopc); + instr = vopc_instr; } - bool is_fcmp() + bool is_vopc() { - return label & label_fcmp; + return label & label_vopc; } void set_scc_needed() @@ -502,6 +526,18 @@ struct opt_ctx { std::vector uses; }; +struct CmpInfo { + aco_opcode ordered; + aco_opcode unordered; + aco_opcode ordered_swapped; + aco_opcode unordered_swapped; + aco_opcode inverse; + aco_opcode f32; + unsigned size; +}; + +ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info); + bool can_swap_operands(aco_ptr& instr) { if (instr->operands[0].isConstant() || @@ -509,35 +545,63 @@ bool can_swap_operands(aco_ptr& instr) return false; switch (instr->opcode) { + case aco_opcode::v_add_u32: + case aco_opcode::v_add_co_u32: + case aco_opcode::v_add_co_u32_e64: + case aco_opcode::v_add_i32: + case aco_opcode::v_add_f16: case aco_opcode::v_add_f32: + case aco_opcode::v_mul_f16: case aco_opcode::v_mul_f32: case aco_opcode::v_or_b32: case aco_opcode::v_and_b32: case aco_opcode::v_xor_b32: + case aco_opcode::v_max_f16: case aco_opcode::v_max_f32: + case aco_opcode::v_min_f16: case aco_opcode::v_min_f32: case aco_opcode::v_max_i32: case aco_opcode::v_min_i32: case aco_opcode::v_max_u32: case aco_opcode::v_min_u32: - case aco_opcode::v_cmp_eq_f32: - case aco_opcode::v_cmp_lg_f32: + case aco_opcode::v_max_i16: + case aco_opcode::v_min_i16: + case aco_opcode::v_max_u16: + case aco_opcode::v_min_u16: + case aco_opcode::v_max_i16_e64: + case aco_opcode::v_min_i16_e64: + case aco_opcode::v_max_u16_e64: + case aco_opcode::v_min_u16_e64: + return true; + case aco_opcode::v_sub_f16: + instr->opcode = aco_opcode::v_subrev_f16; return true; case aco_opcode::v_sub_f32: instr->opcode = aco_opcode::v_subrev_f32; return true; - case aco_opcode::v_cmp_lt_f32: - instr->opcode = aco_opcode::v_cmp_gt_f32; + case aco_opcode::v_sub_co_u32: + instr->opcode = aco_opcode::v_subrev_co_u32; return true; - case aco_opcode::v_cmp_ge_f32: - instr->opcode = aco_opcode::v_cmp_le_f32; + case aco_opcode::v_sub_u16: + instr->opcode = aco_opcode::v_subrev_u16; return true; - case aco_opcode::v_cmp_lt_i32: - instr->opcode = aco_opcode::v_cmp_gt_i32; + case aco_opcode::v_sub_u32: + instr->opcode = aco_opcode::v_subrev_u32; return true; - default: + default: { + CmpInfo info; + get_cmp_info(instr->opcode, &info); + if (info.ordered == instr->opcode) { + 
instr->opcode = info.ordered_swapped; + return true; + } + if (info.unordered == instr->opcode) { + instr->opcode = info.unordered_swapped; + return true; + } return false; } + } } bool can_use_VOP3(opt_ctx& ctx, const aco_ptr& instr) @@ -672,7 +736,7 @@ bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands) return true; } -bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset) +bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow) { Operand op = instr->operands[op_index]; @@ -694,6 +758,8 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp default: return false; } + if (prevent_overflow && !add_instr->definitions[0].isNUW()) + return false; if (add_instr->usesModifiers()) return false; @@ -711,7 +777,7 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp continue; uint32_t offset2 = 0; - if (parse_base_offset(ctx, add_instr, !i, base, &offset2)) { + if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) { *offset += offset2; } else { *base = add_instr->operands[!i].getTemp(); @@ -758,7 +824,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) ASSERTED bool all_const = false; for (Operand& op : instr->operands) all_const = all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32)); - perfwarn(all_const, "All instruction operands are constant", instr.get()); + perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get()); } for (unsigned i = 0; i < instr->operands.size(); i++) @@ -860,7 +926,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) unsigned bits = get_operand_size(instr, i); if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i)) { Operand op = get_constant_op(ctx, info, bits); - perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get()); + perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get()); if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) { instr->operands[i] = op; continue; @@ -884,6 +950,15 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) while (info.is_temp()) info = ctx.info[info.temp.id()]; + /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr + * overflow for scratch accesses works only on GFX9+ and saddr overflow + * never works. Since swizzling is the only thing that separates + * scratch accesses and other accesses and swizzling changing how + * addressing works significantly, this probably applies to swizzled + * MUBUF accesses. 
*/
+      bool vaddr_prevent_overflow = mubuf->swizzled && ctx.program->chip_class < GFX9;
+      bool saddr_prevent_overflow = mubuf->swizzled;
+
       if (mubuf->offen && i == 1 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) {
          assert(!mubuf->idxen);
          instr->operands[1] = Operand(v1);
@@ -894,12 +969,14 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
          instr->operands[2] = Operand((uint32_t) 0);
          mubuf->offset += info.val;
          continue;
-      } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) {
+      } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, vaddr_prevent_overflow) &&
+                 base.regClass() == v1 && mubuf->offset + offset < 4096) {
          assert(!mubuf->idxen);
          instr->operands[1].setTemp(base);
          mubuf->offset += offset;
          continue;
-      } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) {
+      } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, saddr_prevent_overflow) &&
+                 base.regClass() == s1 && mubuf->offset + offset < 4096) {
          instr->operands[i].setTemp(base);
          mubuf->offset += offset;
          continue;
@@ -914,7 +991,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
       uint32_t offset;
       bool has_usable_ds_offset = ctx.program->chip_class >= GFX7;
       if (has_usable_ds_offset &&
-          i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) &&
+          i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
           base.regClass() == instr->operands[i].regClass() &&
           instr->opcode != aco_opcode::ds_swizzle_b32) {
          if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 ||
@@ -944,13 +1021,14 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
       SMEM_instruction *smem = static_cast<SMEM_instruction*>(instr.get());
       Temp base;
       uint32_t offset;
+      bool prevent_overflow = smem->operands[0].size() > 2 || smem->prevent_overflow;
       if (i == 1 && info.is_constant_or_literal(32) &&
           ((ctx.program->chip_class == GFX6 && info.val <= 0x3FF) ||
            (ctx.program->chip_class == GFX7 && info.val <= 0xFFFFFFFF) ||
            (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) {
          instr->operands[i] = Operand(info.val);
          continue;
-      } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
+      } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, prevent_overflow) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
         bool soe = smem->operands.size() >= (!smem->definitions.empty() ?
3 : 4); if (soe && (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal(32) || @@ -969,8 +1047,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) new_instr->operands.back() = Operand(base); if (!smem->definitions.empty()) new_instr->definitions[0] = smem->definitions[0]; - new_instr->can_reorder = smem->can_reorder; - new_instr->barrier = smem->barrier; + new_instr->sync = smem->sync; new_instr->glc = smem->glc; new_instr->dlc = smem->dlc; new_instr->nv = smem->nv; @@ -995,6 +1072,11 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) if (instr->definitions.empty()) return; + if ((uint16_t) instr->format & (uint16_t) Format::VOPC) { + ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get()); + return; + } + switch (instr->opcode) { case aco_opcode::p_create_vector: { bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() && @@ -1034,8 +1116,20 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) break; } case aco_opcode::p_split_vector: { - if (!ctx.info[instr->operands[0].tempId()].is_vec()) + ssa_info& info = ctx.info[instr->operands[0].tempId()]; + + if (info.is_constant_or_literal(32)) { + uint32_t val = info.val; + for (Definition def : instr->definitions) { + uint32_t mask = u_bit_consecutive(0, def.bytes() * 8u); + ctx.info[def.tempId()].set_constant(ctx.program->chip_class, val & mask); + val >>= def.bytes() * 8u; + } break; + } else if (!info.is_vec()) { + break; + } + Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; unsigned split_offset = 0; unsigned vec_offset = 0; @@ -1060,13 +1154,20 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) break; } case aco_opcode::p_extract_vector: { /* mov */ - if (!ctx.info[instr->operands[0].tempId()].is_vec()) + ssa_info& info = ctx.info[instr->operands[0].tempId()]; + const unsigned index = instr->operands[1].constantValue(); + const unsigned dst_offset = index * instr->definitions[0].bytes(); + + if (info.is_constant_or_literal(32)) { + uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u); + ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, (info.val >> (dst_offset * 8u)) & mask); break; + } else if (!info.is_vec()) { + break; + } /* check if we index directly into a vector element */ - Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; - const unsigned index = instr->operands[1].constantValue(); - const unsigned dst_offset = index * instr->definitions[0].bytes(); + Instruction* vec = info.instr; unsigned offset = 0; for (const Operand& op : vec->operands) { @@ -1293,6 +1394,14 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); break; + } else if (ctx.info[instr->operands[0].tempId()].is_vopc()) { + Instruction* vopc_instr = ctx.info[instr->operands[0].tempId()].instr; + /* Remove superfluous s_and when the VOPC instruction uses the same exec and thus already produces the same result */ + if (vopc_instr->pass_flags == instr->pass_flags) { + assert(instr->pass_flags > 0); + ctx.info[instr->definitions[0].tempId()].set_temp(vopc_instr->definitions[0].getTemp()); + break; + } } } /* fallthrough */ @@ -1325,22 +1434,6 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) case 
aco_opcode::v_max_i16: ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get()); break; - case aco_opcode::v_cmp_lt_f32: - case aco_opcode::v_cmp_eq_f32: - case aco_opcode::v_cmp_le_f32: - case aco_opcode::v_cmp_gt_f32: - case aco_opcode::v_cmp_lg_f32: - case aco_opcode::v_cmp_ge_f32: - case aco_opcode::v_cmp_o_f32: - case aco_opcode::v_cmp_u_f32: - case aco_opcode::v_cmp_nge_f32: - case aco_opcode::v_cmp_nlg_f32: - case aco_opcode::v_cmp_ngt_f32: - case aco_opcode::v_cmp_nle_f32: - case aco_opcode::v_cmp_neq_f32: - case aco_opcode::v_cmp_nlt_f32: - ctx.info[instr->definitions[0].tempId()].set_fcmp(instr.get()); - break; case aco_opcode::s_cselect_b64: case aco_opcode::s_cselect_b32: if (instr->operands[0].constantEquals((unsigned) -1) && @@ -1365,24 +1458,51 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) } } -ALWAYS_INLINE bool get_cmp_info(aco_opcode op, aco_opcode *ordered, aco_opcode *unordered, aco_opcode *inverse) +ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info) { - *ordered = *unordered = op; + info->ordered = aco_opcode::num_opcodes; + info->unordered = aco_opcode::num_opcodes; + info->ordered_swapped = aco_opcode::num_opcodes; + info->unordered_swapped = aco_opcode::num_opcodes; switch (op) { - #define CMP(ord, unord) \ - case aco_opcode::v_cmp_##ord##_f32:\ - case aco_opcode::v_cmp_n##unord##_f32:\ - *ordered = aco_opcode::v_cmp_##ord##_f32;\ - *unordered = aco_opcode::v_cmp_n##unord##_f32;\ - *inverse = op == aco_opcode::v_cmp_n##unord##_f32 ? aco_opcode::v_cmp_##unord##_f32 : aco_opcode::v_cmp_n##ord##_f32;\ + #define CMP2(ord, unord, ord_swap, unord_swap, sz) \ + case aco_opcode::v_cmp_##ord##_f##sz:\ + case aco_opcode::v_cmp_n##unord##_f##sz:\ + info->ordered = aco_opcode::v_cmp_##ord##_f##sz;\ + info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;\ + info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz;\ + info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz;\ + info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz : aco_opcode::v_cmp_n##ord##_f##sz;\ + info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 : aco_opcode::v_cmp_n##unord##_f32;\ + info->size = sz;\ return true; - CMP(lt, /*n*/ge) - CMP(eq, /*n*/lg) - CMP(le, /*n*/gt) - CMP(gt, /*n*/le) - CMP(lg, /*n*/eq) - CMP(ge, /*n*/lt) + #define CMP(ord, unord, ord_swap, unord_swap) \ + CMP2(ord, unord, ord_swap, unord_swap, 16)\ + CMP2(ord, unord, ord_swap, unord_swap, 32)\ + CMP2(ord, unord, ord_swap, unord_swap, 64) + CMP(lt, /*n*/ge, gt, /*n*/le) + CMP(eq, /*n*/lg, eq, /*n*/lg) + CMP(le, /*n*/gt, ge, /*n*/lt) + CMP(gt, /*n*/le, lt, /*n*/le) + CMP(lg, /*n*/eq, lg, /*n*/eq) + CMP(ge, /*n*/lt, le, /*n*/gt) #undef CMP + #undef CMP2 + #define ORD_TEST(sz) \ + case aco_opcode::v_cmp_u_f##sz:\ + info->f32 = aco_opcode::v_cmp_u_f32;\ + info->inverse = aco_opcode::v_cmp_o_f##sz;\ + info->size = sz;\ + return true;\ + case aco_opcode::v_cmp_o_f##sz:\ + info->f32 = aco_opcode::v_cmp_o_f32;\ + info->inverse = aco_opcode::v_cmp_u_f##sz;\ + info->size = sz;\ + return true; + ORD_TEST(16) + ORD_TEST(32) + ORD_TEST(64) + #undef ORD_TEST default: return false; } @@ -1390,26 +1510,38 @@ ALWAYS_INLINE bool get_cmp_info(aco_opcode op, aco_opcode *ordered, aco_opcode * aco_opcode get_ordered(aco_opcode op) { - aco_opcode ordered, unordered, inverse; - return get_cmp_info(op, &ordered, &unordered, &inverse) ? ordered : aco_opcode::num_opcodes; + CmpInfo info; + return get_cmp_info(op, &info) ? 
info.ordered : aco_opcode::num_opcodes; } aco_opcode get_unordered(aco_opcode op) { - aco_opcode ordered, unordered, inverse; - return get_cmp_info(op, &ordered, &unordered, &inverse) ? unordered : aco_opcode::num_opcodes; + CmpInfo info; + return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes; } aco_opcode get_inverse(aco_opcode op) { - aco_opcode ordered, unordered, inverse; - return get_cmp_info(op, &ordered, &unordered, &inverse) ? inverse : aco_opcode::num_opcodes; + CmpInfo info; + return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes; +} + +aco_opcode get_f32_cmp(aco_opcode op) +{ + CmpInfo info; + return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes; +} + +unsigned get_cmp_bitsize(aco_opcode op) +{ + CmpInfo info; + return get_cmp_info(op, &info) ? info.size : 0; } bool is_cmp(aco_opcode op) { - aco_opcode ordered, unordered, inverse; - return get_cmp_info(op, &ordered, &unordered, &inverse); + CmpInfo info; + return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes; } unsigned original_temp_id(opt_ctx &ctx, Temp tmp) @@ -1465,14 +1597,18 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) Instruction *op_instr[2]; Temp op[2]; + unsigned bitsize = 0; for (unsigned i = 0; i < 2; i++) { op_instr[i] = follow_operand(ctx, instr->operands[i], true); if (!op_instr[i]) return false; aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32; + unsigned op_bitsize = get_cmp_bitsize(op_instr[i]->opcode); - if (op_instr[i]->opcode != expected_cmp) + if (get_f32_cmp(op_instr[i]->opcode) != expected_cmp) + return false; + if (bitsize && op_bitsize != bitsize) return false; if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp()) return false; @@ -1492,6 +1628,7 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) return false; op[i] = op1; + bitsize = op_bitsize; } if (op[1].type() == RegType::sgpr) @@ -1505,7 +1642,18 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) decrease_uses(ctx, op_instr[0]); decrease_uses(ctx, op_instr[1]); - aco_opcode new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; + aco_opcode new_op = aco_opcode::num_opcodes; + switch (bitsize) { + case 16: + new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; + break; + case 32: + new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; + break; + case 64: + new_op = is_or ? 
aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; + break; + } Instruction *new_instr; if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) { VOP3A_instruction *vop3 = create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); @@ -1523,7 +1671,7 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) new_instr->definitions[0] = instr->definitions[0]; ctx.info[instr->definitions[0].tempId()].label = 0; - ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr); instr.reset(new_instr); @@ -1547,12 +1695,12 @@ bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) if (!nan_test || !cmp) return false; - if (cmp->opcode == expected_nan_test) + if (get_f32_cmp(cmp->opcode) == expected_nan_test) std::swap(nan_test, cmp); - else if (nan_test->opcode != expected_nan_test) + else if (get_f32_cmp(nan_test->opcode) != expected_nan_test) return false; - if (!is_cmp(cmp->opcode)) + if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode)) return false; if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp()) @@ -1593,7 +1741,7 @@ bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) new_instr->definitions[0] = instr->definitions[0]; ctx.info[instr->definitions[0].tempId()].label = 0; - ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr); instr.reset(new_instr); @@ -1618,12 +1766,12 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in return false; aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32; - if (cmp->opcode == expected_nan_test) + if (get_f32_cmp(cmp->opcode) == expected_nan_test) std::swap(nan_test, cmp); - else if (nan_test->opcode != expected_nan_test) + else if (get_f32_cmp(nan_test->opcode) != expected_nan_test) return false; - if (!is_cmp(cmp->opcode)) + if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode)) return false; if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp()) @@ -1696,7 +1844,7 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in new_instr->definitions[0] = instr->definitions[0]; ctx.info[instr->definitions[0].tempId()].label = 0; - ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr); instr.reset(new_instr); @@ -1745,7 +1893,7 @@ bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr& instr) new_instr->definitions[0] = instr->definitions[0]; ctx.info[instr->definitions[0].tempId()].label = 0; - ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr); instr.reset(new_instr); @@ -1843,7 +1991,7 @@ void create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr& bool combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode op2, aco_opcode new_op, const char *shuffle, uint8_t ops) { - uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & + uint64_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & (label_omod_success | label_clamp_success); for (unsigned swap = 0; swap < 2; swap++) { @@ -1874,7 +2022,7 @@ bool combine_minmax(opt_ctx& ctx, aco_ptr& instr, aco_opcode opposi if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2)) return true; - uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & + uint64_t 
omod_clamp = ctx.info[instr->definitions[0].tempId()].label &
                         (label_omod_success | label_clamp_success);
 
    /* min(-max(a, b), c) -> min3(-a, -b, c) *
@@ -2128,7 +2276,7 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr,
    else
       return false;
 
-   uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label &
+   uint64_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label &
                          (label_omod_success | label_clamp_success);
 
    for (unsigned swap = 0; swap < 2; swap++) {
@@ -2323,7 +2471,7 @@ bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
       /* omod was successfully applied */
       /* if the omod instruction is v_mad, we also have to change the original add */
       if (ctx.info[instr->operands[idx].tempId()].is_mad()) {
-         Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].val].add_instr.get();
+         Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].instr->pass_flags].add_instr.get();
          if (ctx.info[instr->definitions[0].tempId()].is_clamp())
             static_cast<VOP3A_instruction*>(add_instr)->clamp = true;
          add_instr->definitions[0] = instr->definitions[0];
@@ -2369,7 +2517,7 @@ bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
       /* clamp was successfully applied */
       /* if the clamp instruction is v_mad, we also have to change the original add */
       if (ctx.info[instr->operands[idx].tempId()].is_mad()) {
-         Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].val].add_instr.get();
+         Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].instr->pass_flags].add_instr.get();
          add_instr->definitions[0] = instr->definitions[0];
       }
       Instruction* clamp_instr = ctx.info[instr->operands[idx].tempId()].instr;
@@ -2487,7 +2635,7 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
                 instr->opcode == aco_opcode::v_sub_f16 ||
                 instr->opcode == aco_opcode::v_subrev_f16;
    if (mad16 || mad32) {
-      bool need_fma = mad32 ? block.fp_mode.denorm32 != 0 :
+      bool need_fma = mad32 ? (block.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3) :
                               (block.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10);
       if (need_fma && instr->definitions[0].isPrecise())
          return;
@@ -2791,7 +2939,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
 
    mad_info* mad_info = NULL;
    if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
-      mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val];
+      mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
       /* re-check mad instructions */
       if (ctx.uses[mad_info->mul_temp_id]) {
          ctx.uses[mad_info->mul_temp_id]++;
@@ -2973,7 +3121,7 @@ void apply_literals(opt_ctx &ctx, aco_ptr<Instruction>& instr)
 
    /* apply literals on MAD */
    if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
-      mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val];
+      mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
       if (info->check_literal &&
           (ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) {
          aco_ptr<Instruction> new_mad;
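
For illustration only, not part of the patch above: a minimal, self-contained C++ sketch of the union-aliasing rule the reworked ssa_info::add_label() has to enforce now that `val` shares storage with `temp` and `instr` — whichever union member was written last is the only one that may be read, so the instruction/temp labels and the value labels must be cleared when the other kind is set. All names below are simplified stand-ins, not ACO's real types.

   #include <cassert>
   #include <cstdint>

   /* Toy stand-in for aco::ssa_info: 'val' and 'instr' overlap in a union, so the
    * label bits that record which member is valid must stay mutually exclusive. */
   struct toy_ssa_info {
      uint64_t label = 0;
      union {
         uint32_t val;
         void *instr;
      };

      static constexpr uint64_t label_instr   = 1ull << 0;
      static constexpr uint64_t label_literal = 1ull << 1;

      void set_instr(void *i)
      {
         label &= ~label_literal; /* instr aliases val: drop the value label */
         label |= label_instr;
         instr = i;
      }

      void set_literal(uint32_t v)
      {
         label &= ~label_instr;   /* val aliases instr: drop the instruction label */
         label |= label_literal;
         val = v;
      }
   };

   int main()
   {
      toy_ssa_info info;
      int dummy = 0;
      info.set_instr(&dummy);
      info.set_literal(42);       /* overwrites the union; instr must no longer be trusted */
      assert(!(info.label & toy_ssa_info::label_instr));
      assert(info.val == 42);
      return 0;
   }

The same aliasing constraint is why the patch stops storing the mad_info index in `val` and keeps it in the instruction's `pass_flags` instead (set_mad / instr->pass_flags): a mad-labelled definition needs both its defining instruction and its table index, and the union can only hold one of them.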