X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_assembler.cpp;h=3bdea63201d0c893bcd737ce80da0e9962b15588;hb=897a47d84771fb367fa82fd9656b3de20197952c;hp=272be08858911b61892c262e2ca08ccbff137ba6;hpb=818bdab796772da77a363f0a96e8895736591aac;p=mesa.git diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 272be088589..3bdea63201d 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -1,21 +1,25 @@ -#include +#include +#include #include "aco_ir.h" #include "common/sid.h" #include "ac_shader_util.h" +#include "util/u_math.h" namespace aco { struct asm_context { Program *program; enum chip_class chip_class; - std::map branches; + std::vector> branches; std::vector constaddrs; const int16_t* opcode; // TODO: keep track of branch instructions referring blocks // and, when emitting the block, correct the offset in instr asm_context(Program* program) : program(program), chip_class(program->chip_class) { - if (chip_class <= GFX9) + if (chip_class <= GFX7) + opcode = &instr_info.opcode_gfx7[0]; + else if (chip_class <= GFX9) opcode = &instr_info.opcode_gfx9[0]; else if (chip_class == GFX10) opcode = &instr_info.opcode_gfx10[0]; @@ -24,10 +28,20 @@ struct asm_context { int subvector_begin_pos = -1; }; -void emit_instruction(asm_context& ctx, std::vector& out, Instruction* instr) +static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg) { - uint32_t instr_offset = out.size() * 4u; + if (sel & sdwa_isra) { + unsigned size = sdwa_rasize & sel; + if (size == 1) + return reg.byte(); + else /* size == 2 */ + return sdwa_isword | (reg.byte() >> 1); + } + return sel & sdwa_asuint; +} +void emit_instruction(asm_context& ctx, std::vector& out, Instruction* instr) +{ /* lower remaining pseudo-instructions */ if (instr->opcode == aco_opcode::p_constaddr) { unsigned dest = instr->definitions[0].physReg(); @@ -52,7 +66,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= 255 << 8; out.push_back(encoding); ctx.constaddrs.push_back(out.size()); - out.push_back(-(instr_offset + 4) + offset); + out.push_back(offset); /* s_addc_u32 dest[1], dest[1], 0 */ encoding = (0b10 << 30); @@ -103,7 +117,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) ? instr->definitions[0].physReg() << 16 : - !instr->operands.empty() && !(instr->operands[0].physReg() == scc) ? + !instr->operands.empty() && instr->operands[0].physReg() <= 127 ? instr->operands[0].physReg() << 16 : 0; encoding |= sopk->imm; out.push_back(encoding); @@ -135,7 +149,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= opcode << 16; encoding |= (uint16_t) sopp->imm; if (sopp->block != -1) - ctx.branches.insert({out.size(), sopp}); + ctx.branches.emplace_back(out.size(), sopp); out.push_back(encoding); break; } @@ -143,9 +157,28 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* SMEM_instruction* smem = static_cast(instr); bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4); bool is_load = !instr->definitions.empty(); - uint32_t encoding = 0; + if (ctx.chip_class <= GFX7) { + encoding = (0b11000 << 27); + encoding |= opcode << 22; + encoding |= instr->definitions.size() ? instr->definitions[0].physReg() << 15 : 0; + encoding |= instr->operands.size() ? (instr->operands[0].physReg() >> 1) << 9 : 0; + if (instr->operands.size() >= 2) { + if (!instr->operands[1].isConstant() || instr->operands[1].constantValue() >= 1024) { + encoding |= instr->operands[1].physReg().reg(); + } else { + encoding |= instr->operands[1].constantValue() >> 2; + encoding |= 1 << 8; + } + } + out.push_back(encoding); + /* SMRD instructions can take a literal on GFX6 & GFX7 */ + if (instr->operands.size() >= 2 && instr->operands[1].isConstant() && instr->operands[1].constantValue() >= 1024) + out.push_back(instr->operands[1].constantValue() >> 2); + return; + } + if (ctx.chip_class <= GFX9) { encoding = (0b110000 << 26); assert(!smem->dlc); /* Device-level coherent is not supported on GFX9 and lower */ @@ -168,10 +201,10 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } if (is_load || instr->operands.size() >= 3) { /* SDATA */ - encoding |= (is_load ? instr->definitions[0].physReg().reg : instr->operands[2].physReg().reg) << 6; + encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg()) << 6; } if (instr->operands.size() >= 1) { /* SBASE */ - encoding |= instr->operands[0].physReg().reg >> 1; + encoding |= instr->operands[0].physReg() >> 1; } out.push_back(encoding); @@ -211,25 +244,27 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* case Format::VOP2: { uint32_t encoding = 0; encoding |= opcode << 25; - encoding |= (0xFF & instr->definitions[0].physReg().reg) << 17; - encoding |= (0xFF & instr->operands[1].physReg().reg) << 9; - encoding |= instr->operands[0].physReg().reg; + encoding |= (0xFF & instr->definitions[0].physReg()) << 17; + encoding |= (0xFF & instr->operands[1].physReg()) << 9; + encoding |= instr->operands[0].physReg(); out.push_back(encoding); break; } case Format::VOP1: { uint32_t encoding = (0b0111111 << 25); - encoding |= (0xFF & instr->definitions[0].physReg().reg) << 17; + if (!instr->definitions.empty()) + encoding |= (0xFF & instr->definitions[0].physReg()) << 17; encoding |= opcode << 9; - encoding |= instr->operands[0].physReg().reg; + if (!instr->operands.empty()) + encoding |= instr->operands[0].physReg(); out.push_back(encoding); break; } case Format::VOPC: { uint32_t encoding = (0b0111110 << 25); encoding |= opcode << 17; - encoding |= (0xFF & instr->operands[1].physReg().reg) << 9; - encoding |= instr->operands[0].physReg().reg; + encoding |= (0xFF & instr->operands[1].physReg()) << 9; + encoding |= instr->operands[0].physReg(); out.push_back(encoding); break; } @@ -237,22 +272,50 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* Interp_instruction* interp = static_cast(instr); uint32_t encoding = 0; - if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { - encoding = (0b110101 << 26); /* Vega ISA doc says 110010 but it's wrong */ + if (instr->opcode == aco_opcode::v_interp_p1ll_f16 || + instr->opcode == aco_opcode::v_interp_p1lv_f16 || + instr->opcode == aco_opcode::v_interp_p2_legacy_f16 || + instr->opcode == aco_opcode::v_interp_p2_f16) { + if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { + encoding = (0b110100 << 26); + } else if (ctx.chip_class == GFX10) { + encoding = (0b110101 << 26); + } else { + unreachable("Unknown chip_class."); + } + + encoding |= opcode << 16; + encoding |= (0xFF & instr->definitions[0].physReg()); + out.push_back(encoding); + + encoding = 0; + encoding |= interp->attribute; + encoding |= interp->component << 6; + encoding |= instr->operands[0].physReg() << 9; + if (instr->opcode == aco_opcode::v_interp_p2_f16 || + instr->opcode == aco_opcode::v_interp_p2_legacy_f16 || + instr->opcode == aco_opcode::v_interp_p1lv_f16) { + encoding |= instr->operands[2].physReg() << 18; + } + out.push_back(encoding); } else { - encoding = (0b110010 << 26); - } + if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { + encoding = (0b110101 << 26); /* Vega ISA doc says 110010 but it's wrong */ + } else { + encoding = (0b110010 << 26); + } - assert(encoding); - encoding |= (0xFF & instr->definitions[0].physReg().reg) << 18; - encoding |= opcode << 16; - encoding |= interp->attribute << 10; - encoding |= interp->component << 8; - if (instr->opcode == aco_opcode::v_interp_mov_f32) - encoding |= (0x3 & instr->operands[0].constantValue()); - else - encoding |= (0xFF & instr->operands[0].physReg().reg); - out.push_back(encoding); + assert(encoding); + encoding |= (0xFF & instr->definitions[0].physReg()) << 18; + encoding |= opcode << 16; + encoding |= interp->attribute << 10; + encoding |= interp->component << 8; + if (instr->opcode == aco_opcode::v_interp_mov_f32) + encoding |= (0x3 & instr->operands[0].constantValue()); + else + encoding |= (0xFF & instr->operands[0].physReg()); + out.push_back(encoding); + } break; } case Format::DS: { @@ -275,7 +338,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= (0xFF & reg) << 16; reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) ? instr->operands[1].physReg() : 0; encoding |= (0xFF & reg) << 8; - encoding |= (0xFF & instr->operands[0].physReg().reg); + encoding |= (0xFF & instr->operands[0].physReg()); out.push_back(encoding); break; } @@ -286,8 +349,11 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= (mubuf->lds ? 1 : 0) << 16; encoding |= (mubuf->glc ? 1 : 0) << 14; encoding |= (mubuf->idxen ? 1 : 0) << 13; + assert(!mubuf->addr64 || ctx.chip_class <= GFX7); + if (ctx.chip_class == GFX6 || ctx.chip_class == GFX7) + encoding |= (mubuf->addr64 ? 1 : 0) << 15; encoding |= (mubuf->offen ? 1 : 0) << 12; - if (ctx.chip_class <= GFX9) { + if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { assert(!mubuf->dlc); /* Device-level coherent is not supported on GFX9 and lower */ encoding |= (mubuf->slc ? 1 : 0) << 17; } else if (ctx.chip_class >= GFX10) { @@ -296,15 +362,15 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= 0x0FFF & mubuf->offset; out.push_back(encoding); encoding = 0; - if (ctx.chip_class >= GFX10) { + if (ctx.chip_class <= GFX7 || ctx.chip_class >= GFX10) { encoding |= (mubuf->slc ? 1 : 0) << 22; } encoding |= instr->operands[2].physReg() << 24; encoding |= (mubuf->tfe ? 1 : 0) << 23; - encoding |= (instr->operands[1].physReg() >> 2) << 16; - unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg().reg; + encoding |= (instr->operands[0].physReg() >> 2) << 16; + unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg(); encoding |= (0xFF & reg) << 8; - encoding |= (0xFF & instr->operands[0].physReg().reg); + encoding |= (0xFF & instr->operands[1].physReg()); out.push_back(encoding); break; } @@ -313,6 +379,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* uint32_t img_format = ac_get_tbuffer_format(ctx.chip_class, mtbuf->dfmt, mtbuf->nfmt); uint32_t encoding = (0b111010 << 26); + assert(img_format <= 0x7F); assert(!mtbuf->dlc || ctx.chip_class >= GFX10); encoding |= (mtbuf->dlc ? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */ encoding |= (mtbuf->glc ? 1 : 0) << 14; @@ -321,7 +388,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= 0x0FFF & mtbuf->offset; encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */ - if (ctx.chip_class <= GFX9) { + if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { encoding |= opcode << 15; } else { encoding |= (opcode & 0x07) << 16; /* 3 LSBs of 4-bit OPCODE */ @@ -330,16 +397,16 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* out.push_back(encoding); encoding = 0; - encoding |= instr->operands[2].physReg().reg << 24; + encoding |= instr->operands[2].physReg() << 24; encoding |= (mtbuf->tfe ? 1 : 0) << 23; encoding |= (mtbuf->slc ? 1 : 0) << 22; - encoding |= (instr->operands[1].physReg().reg >> 2) << 16; - unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg().reg : instr->definitions[0].physReg().reg; + encoding |= (instr->operands[0].physReg() >> 2) << 16; + unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg(); encoding |= (0xFF & reg) << 8; - encoding |= (0xFF & instr->operands[0].physReg().reg); + encoding |= (0xFF & instr->operands[1].physReg()); if (ctx.chip_class >= GFX10) { - encoding |= (((opcode & 0x08) >> 4) << 21); /* MSB of 4-bit OPCODE */ + encoding |= (((opcode & 0x08) >> 3) << 21); /* MSB of 4-bit OPCODE */ } out.push_back(encoding); @@ -366,15 +433,15 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } encoding |= (0xF & mimg->dmask) << 8; out.push_back(encoding); - encoding = (0xFF & instr->operands[0].physReg().reg); /* VADDR */ + encoding = (0xFF & instr->operands[2].physReg()); /* VADDR */ if (!instr->definitions.empty()) { - encoding |= (0xFF & instr->definitions[0].physReg().reg) << 8; /* VDATA */ - } else if (instr->operands.size() == 4) { - encoding |= (0xFF & instr->operands[3].physReg().reg) << 8; /* VDATA */ + encoding |= (0xFF & instr->definitions[0].physReg()) << 8; /* VDATA */ + } else if (instr->operands[1].regClass().type() == RegType::vgpr) { + encoding |= (0xFF & instr->operands[1].physReg()) << 8; /* VDATA */ } - encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 16; /* T# (resource) */ - if (instr->operands.size() > 2) - encoding |= (0x1F & (instr->operands[2].physReg() >> 2)) << 21; /* sampler */ + encoding |= (0x1F & (instr->operands[0].physReg() >> 2)) << 16; /* T# (resource) */ + if (instr->operands[1].regClass().type() == RegType::sgpr) + encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 21; /* sampler */ assert(!mimg->d16 || ctx.chip_class >= GFX9); encoding |= mimg->d16 ? 1 << 15 : 0; @@ -394,9 +461,14 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* if (ctx.chip_class <= GFX9) { assert(flat->offset <= 0x1fff); encoding |= flat->offset & 0x1fff; + } else if (instr->format == Format::FLAT) { + /* GFX10 has a 12-bit immediate OFFSET field, + * but it has a hw bug: it ignores the offset, called FlatSegmentOffsetBug + */ + assert(flat->offset == 0); } else { - assert(flat->offset <= 0x0fff); - encoding |= flat->offset & 0x0fff; + assert(flat->offset <= 0xfff); + encoding |= flat->offset & 0xfff; } if (instr->format == Format::SCRATCH) encoding |= 1 << 14; @@ -415,13 +487,13 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding = (0xFF & instr->operands[0].physReg()); if (!instr->definitions.empty()) encoding |= (0xFF & instr->definitions[0].physReg()) << 24; - else + if (instr->operands.size() >= 3) encoding |= (0xFF & instr->operands[2].physReg()) << 8; if (!instr->operands[1].isUndefined()) { assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F); assert(instr->format != Format::FLAT); encoding |= instr->operands[1].physReg() << 16; - } else if (instr->format != Format::FLAT) { + } else if (instr->format != Format::FLAT || ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */ if (ctx.chip_class <= GFX9) encoding |= 0x7F << 16; else @@ -434,9 +506,9 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* case Format::EXP: { Export_instruction* exp = static_cast(instr); uint32_t encoding; - if (ctx.chip_class <= GFX9) { + if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { encoding = (0b110001 << 26); - } else if (ctx.chip_class >= GFX10) { + } else { encoding = (0b111110 << 26); } @@ -446,10 +518,10 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= exp->dest << 4; encoding |= exp->enabled_mask; out.push_back(encoding); - encoding = 0xFF & exp->operands[0].physReg().reg; - encoding |= (0xFF & exp->operands[1].physReg().reg) << 8; - encoding |= (0xFF & exp->operands[2].physReg().reg) << 16; - encoding |= (0xFF & exp->operands[3].physReg().reg) << 24; + encoding = 0xFF & exp->operands[0].physReg(); + encoding |= (0xFF & exp->operands[1].physReg()) << 8; + encoding |= (0xFF & exp->operands[2].physReg()) << 16; + encoding |= (0xFF & exp->operands[3].physReg()) << 24; out.push_back(encoding); break; } @@ -463,33 +535,38 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* if ((uint16_t) instr->format & (uint16_t) Format::VOP2) { opcode = opcode + 0x100; } else if ((uint16_t) instr->format & (uint16_t) Format::VOP1) { - if (ctx.chip_class <= GFX9) { + if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) opcode = opcode + 0x140; - } else { - /* RDNA ISA doc says this is 0x140, but that doesn't work */ + else opcode = opcode + 0x180; - } } else if ((uint16_t) instr->format & (uint16_t) Format::VOPC) { opcode = opcode + 0x0; } else if ((uint16_t) instr->format & (uint16_t) Format::VINTRP) { opcode = opcode + 0x270; } - // TODO: op_sel uint32_t encoding; if (ctx.chip_class <= GFX9) { encoding = (0b110100 << 26); } else if (ctx.chip_class == GFX10) { encoding = (0b110101 << 26); + } else { + unreachable("Unknown chip_class."); } - encoding |= opcode << 16; - encoding |= (vop3->clamp ? 1 : 0) << 15; + if (ctx.chip_class <= GFX7) { + encoding |= opcode << 17; + encoding |= (vop3->clamp ? 1 : 0) << 11; + } else { + encoding |= opcode << 16; + encoding |= (vop3->clamp ? 1 : 0) << 15; + } + encoding |= vop3->opsel << 11; for (unsigned i = 0; i < 3; i++) encoding |= vop3->abs[i] << (8+i); if (instr->definitions.size() == 2) encoding |= instr->definitions[1].physReg() << 8; - encoding |= (0xFF & instr->definitions[0].physReg().reg); + encoding |= (0xFF & instr->definitions[0].physReg()); out.push_back(encoding); encoding = 0; if (instr->opcode == aco_opcode::v_interp_mov_f32) { @@ -503,11 +580,40 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= vop3->neg[i] << (29+i); out.push_back(encoding); + } else if (instr->format == Format::VOP3P) { + VOP3P_instruction* vop3 = static_cast(instr); + + uint32_t encoding; + if (ctx.chip_class == GFX9) { + encoding = (0b110100111 << 23); + } else if (ctx.chip_class == GFX10) { + encoding = (0b110011 << 26); + } else { + unreachable("Unknown chip_class."); + } + + encoding |= opcode << 16; + encoding |= (vop3->clamp ? 1 : 0) << 15; + encoding |= vop3->opsel_lo << 11; + encoding |= (vop3->opsel_hi & 0x4) ? 1 : 0 << 14; + for (unsigned i = 0; i < 3; i++) + encoding |= vop3->neg_hi[i] << (8+i); + encoding |= (0xFF & instr->definitions[0].physReg()); + out.push_back(encoding); + encoding = 0; + for (unsigned i = 0; i < instr->operands.size(); i++) + encoding |= instr->operands[i].physReg() << (i * 9); + encoding |= vop3->opsel_hi & 0x3 << 27; + for (unsigned i = 0; i < 3; i++) + encoding |= vop3->neg_lo[i] << (29+i); + out.push_back(encoding); + } else if (instr->isDPP()){ + assert(ctx.chip_class >= GFX8); /* first emit the instruction without the DPP operand */ Operand dpp_op = instr->operands[0]; instr->operands[0] = Operand(PhysReg{250}, v1); - instr->format = (Format) ((uint32_t) instr->format & ~(1 << 14)); + instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::DPP); emit_instruction(ctx, out, instr); DPP_instruction* dpp = static_cast(instr); uint32_t encoding = (0xF & dpp->row_mask) << 28; @@ -518,9 +624,52 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= dpp->neg[0] << 20; encoding |= dpp->bound_ctrl << 19; encoding |= dpp->dpp_ctrl << 8; - encoding |= (0xFF) & dpp_op.physReg().reg; + encoding |= (0xFF) & dpp_op.physReg(); out.push_back(encoding); return; + } else if (instr->isSDWA()) { + /* first emit the instruction without the SDWA operand */ + Operand sdwa_op = instr->operands[0]; + instr->operands[0] = Operand(PhysReg{249}, v1); + instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::SDWA); + emit_instruction(ctx, out, instr); + + SDWA_instruction* sdwa = static_cast(instr); + uint32_t encoding = 0; + + if ((uint16_t)instr->format & (uint16_t)Format::VOPC) { + if (instr->definitions[0].physReg() != vcc) { + encoding |= instr->definitions[0].physReg() << 8; + encoding |= 1 << 15; + } + encoding |= (sdwa->clamp ? 1 : 0) << 13; + } else { + encoding |= get_sdwa_sel(sdwa->dst_sel, instr->definitions[0].physReg()) << 8; + uint32_t dst_u = sdwa->dst_sel & sdwa_sext ? 1 : 0; + if (sdwa->dst_preserve || (sdwa->dst_sel & sdwa_isra)) + dst_u = 2; + encoding |= dst_u << 11; + encoding |= (sdwa->clamp ? 1 : 0) << 13; + encoding |= sdwa->omod << 14; + } + + encoding |= get_sdwa_sel(sdwa->sel[0], sdwa_op.physReg()) << 16; + encoding |= sdwa->sel[0] & sdwa_sext ? 1 << 19 : 0; + encoding |= sdwa->abs[0] << 21; + encoding |= sdwa->neg[0] << 20; + + if (instr->operands.size() >= 2) { + encoding |= get_sdwa_sel(sdwa->sel[1], instr->operands[1].physReg()) << 24; + encoding |= sdwa->sel[1] & sdwa_sext ? 1 << 27 : 0; + encoding |= sdwa->abs[1] << 29; + encoding |= sdwa->neg[1] << 28; + } + + encoding |= 0xFF & sdwa_op.physReg(); + encoding |= (sdwa_op.physReg() < 256) << 23; + if (instr->operands.size() >= 2) + encoding |= (instr->operands[1].physReg() < 256) << 31; + out.push_back(encoding); } else { unreachable("unimplemented instruction format"); } @@ -555,16 +704,16 @@ void emit_block(asm_context& ctx, std::vector& out, Block& block) void fix_exports(asm_context& ctx, std::vector& out, Program* program) { - for (int idx = program->blocks.size() - 1; idx >= 0; idx--) { - Block& block = program->blocks[idx]; + bool exported = false; + for (Block& block : program->blocks) { + if (!(block.kind & block_kind_export_end)) + continue; std::vector>::reverse_iterator it = block.instructions.rbegin(); - bool endBlock = false; - bool exported = false; while ( it != block.instructions.rend()) { - if ((*it)->format == Format::EXP && endBlock) { + if ((*it)->format == Format::EXP) { Export_instruction* exp = static_cast((*it).get()); - if (program->stage & hw_vs) { + if (program->stage & (hw_vs | hw_ngg_gs)) { if (exp->dest >= V_008DFC_SQ_EXP_POS && exp->dest <= (V_008DFC_SQ_EXP_POS + 3)) { exp->done = true; exported = true; @@ -578,36 +727,66 @@ void fix_exports(asm_context& ctx, std::vector& out, Program* program) } } else if ((*it)->definitions.size() && (*it)->definitions[0].physReg() == exec) break; - else if ((*it)->opcode == aco_opcode::s_endpgm) { - if (endBlock) - break; - endBlock = true; - } ++it; } - if (!endBlock || exported) - continue; - /* we didn't find an Export instruction and have to insert a null export */ - aco_ptr exp{create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; - for (unsigned i = 0; i < 4; i++) - exp->operands[i] = Operand(v1); - exp->enabled_mask = 0; - exp->compressed = false; - exp->done = true; - exp->valid_mask = program->stage & hw_fs; - if (program->stage & hw_fs) - exp->dest = 9; /* NULL */ - else - exp->dest = V_008DFC_SQ_EXP_POS; - /* insert the null export 1 instruction before endpgm */ - block.instructions.insert(block.instructions.end() - 1, std::move(exp)); + } + + if (!exported) { + /* Abort in order to avoid a GPU hang. */ + fprintf(stderr, "Missing export in %s shader:\n", (program->stage & hw_vs) ? "vertex" : "fragment"); + aco_print_program(program, stderr); + abort(); } } +static void fix_branches_gfx10(asm_context& ctx, std::vector& out) +{ + /* Branches with an offset of 0x3f are buggy on GFX10, we workaround by inserting NOPs if needed. */ + bool gfx10_3f_bug = false; + + do { + auto buggy_branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [&ctx](const auto &branch) -> bool { + return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) == 0x3f; + }); + + gfx10_3f_bug = buggy_branch_it != ctx.branches.end(); + + if (gfx10_3f_bug) { + /* Insert an s_nop after the branch */ + constexpr uint32_t s_nop_0 = 0xbf800000u; + int s_nop_pos = buggy_branch_it->first + 1; + auto out_pos = std::next(out.begin(), s_nop_pos); + out.insert(out_pos, s_nop_0); + + /* Update the offset of each affected block */ + for (Block& block : ctx.program->blocks) { + if (block.offset > (unsigned)buggy_branch_it->first) + block.offset++; + } + + /* Update the branches following the current one */ + for (auto branch_it = std::next(buggy_branch_it); branch_it != ctx.branches.end(); ++branch_it) + branch_it->first++; + + /* Find first constant address after the inserted instruction */ + auto caddr_it = std::find_if(ctx.constaddrs.begin(), ctx.constaddrs.end(), [s_nop_pos](const int &caddr_pos) -> bool { + return caddr_pos >= s_nop_pos; + }); + + /* Update the locations of constant addresses */ + for (; caddr_it != ctx.constaddrs.end(); ++caddr_it) + (*caddr_it)++; + + } + } while (gfx10_3f_bug); +} + void fix_branches(asm_context& ctx, std::vector& out) { - for (std::pair branch : ctx.branches) - { + if (ctx.chip_class >= GFX10) + fix_branches_gfx10(ctx, out); + + for (std::pair &branch : ctx.branches) { int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1; out[branch.first] |= (uint16_t) offset; } @@ -616,7 +795,7 @@ void fix_branches(asm_context& ctx, std::vector& out) void fix_constaddrs(asm_context& ctx, std::vector& out) { for (unsigned addr : ctx.constaddrs) - out[addr] += out.size() * 4u; + out[addr] += (out.size() - addr + 1u) * 4u; } unsigned emit_program(Program* program, @@ -624,7 +803,7 @@ unsigned emit_program(Program* program, { asm_context ctx(program); - if (program->stage & (hw_vs | hw_fs)) + if (program->stage & (hw_vs | hw_fs | hw_ngg_gs)) fix_exports(ctx, code, program); for (Block& block : program->blocks) { @@ -633,16 +812,26 @@ unsigned emit_program(Program* program, } fix_branches(ctx, code); + + unsigned exec_size = code.size() * sizeof(uint32_t); + + if (program->chip_class >= GFX10) { + /* Pad output with s_code_end so instruction prefetching doesn't cause + * page faults */ + unsigned final_size = align(code.size() + 3 * 16, 16); + while (code.size() < final_size) + code.push_back(0xbf9f0000u); + } + fix_constaddrs(ctx, code); - unsigned constant_data_offset = code.size() * sizeof(uint32_t); while (program->constant_data.size() % 4u) program->constant_data.push_back(0); /* Copy constant data */ code.insert(code.end(), (uint32_t*)program->constant_data.data(), (uint32_t*)(program->constant_data.data() + program->constant_data.size())); - return constant_data_offset; + return exec_size; } }