5 #include "common/sid.h"
6 #include "ac_shader_util.h"
7 #include "util/u_math.h"
13 enum chip_class chip_class
;
14 std::vector
<std::pair
<int, SOPP_instruction
*>> branches
;
15 std::vector
<unsigned> constaddrs
;
16 const int16_t* opcode
;
17 // TODO: keep track of branch instructions referring blocks
18 // and, when emitting the block, correct the offset in instr
/* Constructs the assembler context for `program`: stores the program pointer, caches
 * its chip_class, and points `opcode` at the first entry of the instr_info opcode
 * table matching that chip class (opcode_gfx7 for chip_class <= GFX7, opcode_gfx9
 * for chip_class <= GFX9, opcode_gfx10 for chip_class >= GFX10). */
19 asm_context(Program
* program
) : program(program
), chip_class(program
->chip_class
) {
20 if (chip_class
<= GFX7
)
21 opcode
= &instr_info
.opcode_gfx7
[0];
22 else if (chip_class
<= GFX9
)
23 opcode
= &instr_info
.opcode_gfx9
[0];
24 else if (chip_class
>= GFX10
)
25 opcode
= &instr_info
.opcode_gfx10
[0];
28 int subvector_begin_pos
= -1;
/* Converts an SDWA selection value `sel` into the value placed in the encoding.
 * When `sel` has the sdwa_isra flag set, the access size is taken from
 * (sdwa_rasize & sel) and the result is derived from the byte offset of `reg`
 * (the visible path returns sdwa_isword | (reg.byte() >> 1)); otherwise the
 * result is sel & sdwa_asuint.
 * NOTE(review): the statements between the size computation and the word-select
 * return are not visible in this view - confirm how the other sizes are handled. */
31 static uint32_t get_sdwa_sel(unsigned sel
, PhysReg reg
)
33 if (sel
& sdwa_isra
) {
34 unsigned size
= sdwa_rasize
& sel
;
38 return sdwa_isword
| (reg
.byte() >> 1);
/* no register-relative flag: return the selector masked with sdwa_asuint */
40 return sel
& sdwa_asuint
;
/*
 * Encodes a single instruction into machine-code dwords appended to `out`.
 *
 * ctx   - assembler state: supplies the per-chip opcode table and chip_class, and
 *         records branch positions (ctx.branches), positions of constant-address
 *         literals (ctx.constaddrs) and the position of a pending
 *         s_subvector_loop_begin (ctx.subvector_begin_pos).
 * out   - output dword stream; every encoded word is push_back'ed onto it.
 * instr - instruction to encode. It may be modified: the immediate of
 *         s_subvector_loop_end is rewritten, and for DPP/SDWA instructions
 *         operand 0 and the format are rewritten before the function recurses.
 *
 * Operands that are literals get their constant value appended after the
 * instruction words.
 *
 * NOTE(review): several lines of this function (case labels, braces and some
 * statements) are not visible in this view; the visible tokens are kept as-is
 * apart from the two VOP3P operator-precedence fixes marked below.
 */
43 void emit_instruction(asm_context
& ctx
, std::vector
<uint32_t>& out
, Instruction
* instr
)
45 /* lower remaining pseudo-instructions */
46 if (instr
->opcode
== aco_opcode::p_constaddr
) {
47 unsigned dest
= instr
->definitions
[0].physReg();
48 unsigned offset
= instr
->operands
[0].constantValue();
50 /* s_getpc_b64 dest[0:1] */
51 uint32_t encoding
= (0b101111101 << 23);
52 uint32_t opcode
= ctx
.opcode
[(int)aco_opcode::s_getpc_b64
];
53 if (opcode
>= 55 && ctx
.chip_class
<= GFX9
) {
54 assert(ctx
.chip_class
== GFX9
&& opcode
< 60);
57 encoding
|= dest
<< 16;
58 encoding
|= opcode
<< 8;
59 out
.push_back(encoding
);
61 /* s_add_u32 dest[0], dest[0], ... */
62 encoding
= (0b10 << 30);
63 encoding
|= ctx
.opcode
[(int)aco_opcode::s_add_u32
] << 23;
64 encoding
|= dest
<< 16;
67 out
.push_back(encoding
);
/* remember where the offset dword lives so it can be patched later */
68 ctx
.constaddrs
.push_back(out
.size());
69 out
.push_back(offset
);
71 /* s_addc_u32 dest[1], dest[1], 0 */
72 encoding
= (0b10 << 30);
73 encoding
|= ctx
.opcode
[(int)aco_opcode::s_addc_u32
] << 23;
74 encoding
|= (dest
+ 1) << 16;
77 out
.push_back(encoding
);
/* look up the hardware opcode for this chip; (uint32_t)-1 marks an unsupported opcode */
81 uint32_t opcode
= ctx
.opcode
[(int)instr
->opcode
];
82 if (opcode
== (uint32_t)-1) {
83 fprintf(stderr
, "Unsupported opcode: ");
84 aco_print_instr(instr
, stderr
);
/* dispatch on the instruction format; each format has its own bit layout */
88 switch (instr
->format
) {
90 uint32_t encoding
= (0b10 << 30);
91 encoding
|= opcode
<< 23;
92 encoding
|= !instr
->definitions
.empty() ? instr
->definitions
[0].physReg() << 16 : 0;
93 encoding
|= instr
->operands
.size() >= 2 ? instr
->operands
[1].physReg() << 8 : 0;
94 encoding
|= !instr
->operands
.empty() ? instr
->operands
[0].physReg() : 0;
95 out
.push_back(encoding
);
99 SOPK_instruction
*sopk
= static_cast<SOPK_instruction
*>(instr
);
101 if (instr
->opcode
== aco_opcode::s_subvector_loop_begin
) {
102 assert(ctx
.chip_class
>= GFX10
);
103 assert(ctx
.subvector_begin_pos
== -1);
104 ctx
.subvector_begin_pos
= out
.size();
105 } else if (instr
->opcode
== aco_opcode::s_subvector_loop_end
) {
106 assert(ctx
.chip_class
>= GFX10
);
107 assert(ctx
.subvector_begin_pos
!= -1);
108 /* Adjust s_subvector_loop_begin instruction to the address after the end */
109 out
[ctx
.subvector_begin_pos
] |= (out
.size() - ctx
.subvector_begin_pos
);
110 /* Adjust s_subvector_loop_end instruction to the address after the beginning */
111 sopk
->imm
= (uint16_t)(ctx
.subvector_begin_pos
- (int)out
.size());
112 ctx
.subvector_begin_pos
= -1;
115 uint32_t encoding
= (0b1011 << 28);
116 encoding
|= opcode
<< 23;
118 !instr
->definitions
.empty() && !(instr
->definitions
[0].physReg() == scc
) ?
119 instr
->definitions
[0].physReg() << 16 :
120 !instr
->operands
.empty() && instr
->operands
[0].physReg() <= 127 ?
121 instr
->operands
[0].physReg() << 16 : 0;
122 encoding
|= sopk
->imm
;
123 out
.push_back(encoding
);
127 uint32_t encoding
= (0b101111101 << 23);
128 if (opcode
>= 55 && ctx
.chip_class
<= GFX9
) {
129 assert(ctx
.chip_class
== GFX9
&& opcode
< 60);
132 encoding
|= !instr
->definitions
.empty() ? instr
->definitions
[0].physReg() << 16 : 0;
133 encoding
|= opcode
<< 8;
134 encoding
|= !instr
->operands
.empty() ? instr
->operands
[0].physReg() : 0;
135 out
.push_back(encoding
);
139 uint32_t encoding
= (0b101111110 << 23);
140 encoding
|= opcode
<< 16;
141 encoding
|= instr
->operands
.size() == 2 ? instr
->operands
[1].physReg() << 8 : 0;
142 encoding
|= !instr
->operands
.empty() ? instr
->operands
[0].physReg() : 0;
143 out
.push_back(encoding
);
147 SOPP_instruction
* sopp
= static_cast<SOPP_instruction
*>(instr
);
148 uint32_t encoding
= (0b101111111 << 23);
149 encoding
|= opcode
<< 16;
150 encoding
|= (uint16_t) sopp
->imm
;
/* branches with a target block are recorded together with their position in `out` */
151 if (sopp
->block
!= -1)
152 ctx
.branches
.emplace_back(out
.size(), sopp
);
153 out
.push_back(encoding
);
157 SMEM_instruction
* smem
= static_cast<SMEM_instruction
*>(instr
);
158 bool soe
= instr
->operands
.size() >= (!instr
->definitions
.empty() ? 3 : 4);
159 bool is_load
= !instr
->definitions
.empty();
160 uint32_t encoding
= 0;
162 if (ctx
.chip_class
<= GFX7
) {
163 encoding
= (0b11000 << 27);
164 encoding
|= opcode
<< 22;
165 encoding
|= instr
->definitions
.size() ? instr
->definitions
[0].physReg() << 15 : 0;
166 encoding
|= instr
->operands
.size() ? (instr
->operands
[0].physReg() >> 1) << 9 : 0;
167 if (instr
->operands
.size() >= 2) {
168 if (!instr
->operands
[1].isConstant() || instr
->operands
[1].constantValue() >= 1024) {
169 encoding
|= instr
->operands
[1].physReg().reg();
171 encoding
|= instr
->operands
[1].constantValue() >> 2;
175 out
.push_back(encoding
);
176 /* SMRD instructions can take a literal on GFX6 & GFX7 */
177 if (instr
->operands
.size() >= 2 && instr
->operands
[1].isConstant() && instr
->operands
[1].constantValue() >= 1024)
178 out
.push_back(instr
->operands
[1].constantValue() >> 2);
182 if (ctx
.chip_class
<= GFX9
) {
183 encoding
= (0b110000 << 26);
184 assert(!smem
->dlc
); /* Device-level coherent is not supported on GFX9 and lower */
185 encoding
|= smem
->nv
? 1 << 15 : 0;
187 encoding
= (0b111101 << 26);
188 assert(!smem
->nv
); /* Non-volatile is not supported on GFX10 */
189 encoding
|= smem
->dlc
? 1 << 14 : 0;
192 encoding
|= opcode
<< 18;
193 encoding
|= smem
->glc
? 1 << 16 : 0;
195 if (ctx
.chip_class
<= GFX9
) {
196 if (instr
->operands
.size() >= 2)
197 encoding
|= instr
->operands
[1].isConstant() ? 1 << 17 : 0; /* IMM - immediate enable */
199 if (ctx
.chip_class
== GFX9
) {
200 encoding
|= soe
? 1 << 14 : 0;
203 if (is_load
|| instr
->operands
.size() >= 3) { /* SDATA */
204 encoding
|= (is_load
? instr
->definitions
[0].physReg() : instr
->operands
[2].physReg()) << 6;
206 if (instr
->operands
.size() >= 1) { /* SBASE */
207 encoding
|= instr
->operands
[0].physReg() >> 1;
210 out
.push_back(encoding
);
214 uint32_t soffset
= ctx
.chip_class
>= GFX10
215 ? sgpr_null
/* On GFX10 this is disabled by specifying SGPR_NULL */
216 : 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on GFX8 and below) */
217 if (instr
->operands
.size() >= 2) {
218 const Operand
&op_off1
= instr
->operands
[1];
219 if (ctx
.chip_class
<= GFX9
) {
220 offset
= op_off1
.isConstant() ? op_off1
.constantValue() : op_off1
.physReg();
222 /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an SGPR */
223 if (op_off1
.isConstant()) {
224 offset
= op_off1
.constantValue();
226 soffset
= op_off1
.physReg();
227 assert(!soe
); /* There is no place to put the other SGPR offset, if any */
232 const Operand
&op_off2
= instr
->operands
.back();
233 assert(ctx
.chip_class
>= GFX9
); /* GFX8 and below don't support specifying a constant and an SGPR at the same time */
234 assert(!op_off2
.isConstant());
235 soffset
= op_off2
.physReg();
239 encoding
|= soffset
<< 25;
241 out
.push_back(encoding
);
245 uint32_t encoding
= 0;
246 encoding
|= opcode
<< 25;
247 encoding
|= (0xFF & instr
->definitions
[0].physReg()) << 17;
248 encoding
|= (0xFF & instr
->operands
[1].physReg()) << 9;
249 encoding
|= instr
->operands
[0].physReg();
250 out
.push_back(encoding
);
254 uint32_t encoding
= (0b0111111 << 25);
255 if (!instr
->definitions
.empty())
256 encoding
|= (0xFF & instr
->definitions
[0].physReg()) << 17;
257 encoding
|= opcode
<< 9;
258 if (!instr
->operands
.empty())
259 encoding
|= instr
->operands
[0].physReg();
260 out
.push_back(encoding
);
264 uint32_t encoding
= (0b0111110 << 25);
265 encoding
|= opcode
<< 17;
266 encoding
|= (0xFF & instr
->operands
[1].physReg()) << 9;
267 encoding
|= instr
->operands
[0].physReg();
268 out
.push_back(encoding
);
271 case Format::VINTRP
: {
272 Interp_instruction
* interp
= static_cast<Interp_instruction
*>(instr
);
273 uint32_t encoding
= 0;
/* the f16 interpolation opcodes use a two-dword encoding; the remaining ones a single dword */
275 if (instr
->opcode
== aco_opcode::v_interp_p1ll_f16
||
276 instr
->opcode
== aco_opcode::v_interp_p1lv_f16
||
277 instr
->opcode
== aco_opcode::v_interp_p2_legacy_f16
||
278 instr
->opcode
== aco_opcode::v_interp_p2_f16
) {
279 if (ctx
.chip_class
== GFX8
|| ctx
.chip_class
== GFX9
) {
280 encoding
= (0b110100 << 26);
281 } else if (ctx
.chip_class
>= GFX10
) {
282 encoding
= (0b110101 << 26);
284 unreachable("Unknown chip_class.");
287 encoding
|= opcode
<< 16;
288 encoding
|= (0xFF & instr
->definitions
[0].physReg());
289 out
.push_back(encoding
);
292 encoding
|= interp
->attribute
;
293 encoding
|= interp
->component
<< 6;
294 encoding
|= instr
->operands
[0].physReg() << 9;
295 if (instr
->opcode
== aco_opcode::v_interp_p2_f16
||
296 instr
->opcode
== aco_opcode::v_interp_p2_legacy_f16
||
297 instr
->opcode
== aco_opcode::v_interp_p1lv_f16
) {
298 encoding
|= instr
->operands
[2].physReg() << 18;
300 out
.push_back(encoding
);
302 if (ctx
.chip_class
== GFX8
|| ctx
.chip_class
== GFX9
) {
303 encoding
= (0b110101 << 26); /* Vega ISA doc says 110010 but it's wrong */
305 encoding
= (0b110010 << 26);
309 encoding
|= (0xFF & instr
->definitions
[0].physReg()) << 18;
310 encoding
|= opcode
<< 16;
311 encoding
|= interp
->attribute
<< 10;
312 encoding
|= interp
->component
<< 8;
313 if (instr
->opcode
== aco_opcode::v_interp_mov_f32
)
314 encoding
|= (0x3 & instr
->operands
[0].constantValue());
316 encoding
|= (0xFF & instr
->operands
[0].physReg());
317 out
.push_back(encoding
);
322 DS_instruction
* ds
= static_cast<DS_instruction
*>(instr
);
323 uint32_t encoding
= (0b110110 << 26);
324 if (ctx
.chip_class
== GFX8
|| ctx
.chip_class
== GFX9
) {
325 encoding
|= opcode
<< 17;
326 encoding
|= (ds
->gds
? 1 : 0) << 16;
328 encoding
|= opcode
<< 18;
329 encoding
|= (ds
->gds
? 1 : 0) << 17;
331 encoding
|= ((0xFF & ds
->offset1
) << 8);
332 encoding
|= (0xFFFF & ds
->offset0
);
333 out
.push_back(encoding
);
335 unsigned reg
= !instr
->definitions
.empty() ? instr
->definitions
[0].physReg() : 0;
336 encoding
|= (0xFF & reg
) << 24;
/* operands 1 and 2 are encoded as 0 when they are absent or are m0 */
337 reg
= instr
->operands
.size() >= 3 && !(instr
->operands
[2].physReg() == m0
) ? instr
->operands
[2].physReg() : 0;
338 encoding
|= (0xFF & reg
) << 16;
339 reg
= instr
->operands
.size() >= 2 && !(instr
->operands
[1].physReg() == m0
) ? instr
->operands
[1].physReg() : 0;
340 encoding
|= (0xFF & reg
) << 8;
341 encoding
|= (0xFF & instr
->operands
[0].physReg());
342 out
.push_back(encoding
);
345 case Format::MUBUF
: {
346 MUBUF_instruction
* mubuf
= static_cast<MUBUF_instruction
*>(instr
);
347 uint32_t encoding
= (0b111000 << 26);
348 encoding
|= opcode
<< 18;
349 encoding
|= (mubuf
->lds
? 1 : 0) << 16;
350 encoding
|= (mubuf
->glc
? 1 : 0) << 14;
351 encoding
|= (mubuf
->idxen
? 1 : 0) << 13;
352 assert(!mubuf
->addr64
|| ctx
.chip_class
<= GFX7
);
353 if (ctx
.chip_class
== GFX6
|| ctx
.chip_class
== GFX7
)
354 encoding
|= (mubuf
->addr64
? 1 : 0) << 15;
355 encoding
|= (mubuf
->offen
? 1 : 0) << 12;
356 if (ctx
.chip_class
== GFX8
|| ctx
.chip_class
== GFX9
) {
357 assert(!mubuf
->dlc
); /* Device-level coherent is not supported on GFX9 and lower */
358 encoding
|= (mubuf
->slc
? 1 : 0) << 17;
359 } else if (ctx
.chip_class
>= GFX10
) {
360 encoding
|= (mubuf
->dlc
? 1 : 0) << 15;
362 encoding
|= 0x0FFF & mubuf
->offset
;
363 out
.push_back(encoding
);
365 if (ctx
.chip_class
<= GFX7
|| ctx
.chip_class
>= GFX10
) {
366 encoding
|= (mubuf
->slc
? 1 : 0) << 22;
368 encoding
|= instr
->operands
[2].physReg() << 24;
369 encoding
|= (mubuf
->tfe
? 1 : 0) << 23;
370 encoding
|= (instr
->operands
[0].physReg() >> 2) << 16;
/* data register: operand 3 when present, otherwise definition 0 */
371 unsigned reg
= instr
->operands
.size() > 3 ? instr
->operands
[3].physReg() : instr
->definitions
[0].physReg();
372 encoding
|= (0xFF & reg
) << 8;
373 encoding
|= (0xFF & instr
->operands
[1].physReg());
374 out
.push_back(encoding
);
377 case Format::MTBUF
: {
378 MTBUF_instruction
* mtbuf
= static_cast<MTBUF_instruction
*>(instr
);
380 uint32_t img_format
= ac_get_tbuffer_format(ctx
.chip_class
, mtbuf
->dfmt
, mtbuf
->nfmt
);
381 uint32_t encoding
= (0b111010 << 26);
382 assert(img_format
<= 0x7F);
383 assert(!mtbuf
->dlc
|| ctx
.chip_class
>= GFX10
);
384 encoding
|= (mtbuf
->dlc
? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */
385 encoding
|= (mtbuf
->glc
? 1 : 0) << 14;
386 encoding
|= (mtbuf
->idxen
? 1 : 0) << 13;
387 encoding
|= (mtbuf
->offen
? 1 : 0) << 12;
388 encoding
|= 0x0FFF & mtbuf
->offset
;
389 encoding
|= (img_format
<< 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */
391 if (ctx
.chip_class
== GFX8
|| ctx
.chip_class
== GFX9
) {
392 encoding
|= opcode
<< 15;
394 encoding
|= (opcode
& 0x07) << 16; /* 3 LSBs of 4-bit OPCODE */
397 out
.push_back(encoding
);
400 encoding
|= instr
->operands
[2].physReg() << 24;
401 encoding
|= (mtbuf
->tfe
? 1 : 0) << 23;
402 encoding
|= (mtbuf
->slc
? 1 : 0) << 22;
403 encoding
|= (instr
->operands
[0].physReg() >> 2) << 16;
404 unsigned reg
= instr
->operands
.size() > 3 ? instr
->operands
[3].physReg() : instr
->definitions
[0].physReg();
405 encoding
|= (0xFF & reg
) << 8;
406 encoding
|= (0xFF & instr
->operands
[1].physReg());
408 if (ctx
.chip_class
>= GFX10
) {
409 encoding
|= (((opcode
& 0x08) >> 3) << 21); /* MSB of 4-bit OPCODE */
412 out
.push_back(encoding
);
416 MIMG_instruction
* mimg
= static_cast<MIMG_instruction
*>(instr
);
417 uint32_t encoding
= (0b111100 << 26);
418 encoding
|= mimg
->slc
? 1 << 25 : 0;
419 encoding
|= opcode
<< 18;
420 encoding
|= mimg
->lwe
? 1 << 17 : 0;
421 encoding
|= mimg
->tfe
? 1 << 16 : 0;
422 encoding
|= mimg
->glc
? 1 << 13 : 0;
423 encoding
|= mimg
->unrm
? 1 << 12 : 0;
424 if (ctx
.chip_class
<= GFX9
) {
425 assert(!mimg
->dlc
); /* Device-level coherent is not supported on GFX9 and lower */
427 encoding
|= mimg
->a16
? 1 << 15 : 0;
428 encoding
|= mimg
->da
? 1 << 14 : 0;
430 encoding
|= mimg
->r128
? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
431 encoding
|= mimg
->dim
<< 3; /* GFX10: dimensionality instead of declare array */
432 encoding
|= mimg
->dlc
? 1 << 7 : 0;
434 encoding
|= (0xF & mimg
->dmask
) << 8;
435 out
.push_back(encoding
);
436 encoding
= (0xFF & instr
->operands
[2].physReg()); /* VADDR */
437 if (!instr
->definitions
.empty()) {
438 encoding
|= (0xFF & instr
->definitions
[0].physReg()) << 8; /* VDATA */
439 } else if (instr
->operands
[1].regClass().type() == RegType::vgpr
) {
440 encoding
|= (0xFF & instr
->operands
[1].physReg()) << 8; /* VDATA */
442 encoding
|= (0x1F & (instr
->operands
[0].physReg() >> 2)) << 16; /* T# (resource) */
443 if (instr
->operands
[1].regClass().type() == RegType::sgpr
)
444 encoding
|= (0x1F & (instr
->operands
[1].physReg() >> 2)) << 21; /* sampler */
446 assert(!mimg
->d16
|| ctx
.chip_class
>= GFX9
);
447 encoding
|= mimg
->d16
? 1 << 15 : 0;
448 if (ctx
.chip_class
>= GFX10
) {
449 encoding
|= mimg
->a16
? 1 << 14 : 0; /* GFX10: A16 still exists, but is in a different place */
452 out
.push_back(encoding
);
456 case Format::SCRATCH
:
457 case Format::GLOBAL
: {
458 FLAT_instruction
*flat
= static_cast<FLAT_instruction
*>(instr
);
459 uint32_t encoding
= (0b110111 << 26);
460 encoding
|= opcode
<< 18;
461 if (ctx
.chip_class
<= GFX9
) {
462 assert(flat
->offset
<= 0x1fff);
463 encoding
|= flat
->offset
& 0x1fff;
464 } else if (instr
->format
== Format::FLAT
) {
465 /* GFX10 has a 12-bit immediate OFFSET field,
466 * but it has a hw bug: it ignores the offset, called FlatSegmentOffsetBug
468 assert(flat
->offset
== 0);
470 assert(flat
->offset
<= 0xfff);
471 encoding
|= flat
->offset
& 0xfff;
473 if (instr
->format
== Format::SCRATCH
)
475 else if (instr
->format
== Format::GLOBAL
)
477 encoding
|= flat
->lds
? 1 << 13 : 0;
478 encoding
|= flat
->glc
? 1 << 16 : 0;
479 encoding
|= flat
->slc
? 1 << 17 : 0;
480 if (ctx
.chip_class
>= GFX10
) {
482 encoding
|= flat
->dlc
? 1 << 12 : 0;
486 out
.push_back(encoding
);
487 encoding
= (0xFF & instr
->operands
[0].physReg());
488 if (!instr
->definitions
.empty())
489 encoding
|= (0xFF & instr
->definitions
[0].physReg()) << 24;
490 if (instr
->operands
.size() >= 3)
491 encoding
|= (0xFF & instr
->operands
[2].physReg()) << 8;
492 if (!instr
->operands
[1].isUndefined()) {
493 assert(ctx
.chip_class
>= GFX10
|| instr
->operands
[1].physReg() != 0x7F);
494 assert(instr
->format
!= Format::FLAT
);
495 encoding
|= instr
->operands
[1].physReg() << 16;
496 } else if (instr
->format
!= Format::FLAT
|| ctx
.chip_class
>= GFX10
) { /* SADDR is actually used with FLAT on GFX10 */
497 if (ctx
.chip_class
<= GFX9
)
498 encoding
|= 0x7F << 16;
500 encoding
|= sgpr_null
<< 16;
502 encoding
|= flat
->nv
? 1 << 23 : 0;
503 out
.push_back(encoding
);
507 Export_instruction
* exp
= static_cast<Export_instruction
*>(instr
);
509 if (ctx
.chip_class
== GFX8
|| ctx
.chip_class
== GFX9
) {
510 encoding
= (0b110001 << 26);
512 encoding
= (0b111110 << 26);
515 encoding
|= exp
->valid_mask
? 0b1 << 12 : 0;
516 encoding
|= exp
->done
? 0b1 << 11 : 0;
517 encoding
|= exp
->compressed
? 0b1 << 10 : 0;
518 encoding
|= exp
->dest
<< 4;
519 encoding
|= exp
->enabled_mask
;
520 out
.push_back(encoding
);
/* second dword: the four source registers, one byte each */
521 encoding
= 0xFF & exp
->operands
[0].physReg();
522 encoding
|= (0xFF & exp
->operands
[1].physReg()) << 8;
523 encoding
|= (0xFF & exp
->operands
[2].physReg()) << 16;
524 encoding
|= (0xFF & exp
->operands
[3].physReg()) << 24;
525 out
.push_back(encoding
);
529 case Format::PSEUDO_BARRIER
:
530 if (instr
->opcode
!= aco_opcode::p_unit_test
)
531 unreachable("Pseudo instructions should be lowered before assembly.");
/* the remaining formats are tested as bit flags of instr->format */
534 if ((uint16_t) instr
->format
& (uint16_t) Format::VOP3A
) {
535 VOP3A_instruction
* vop3
= static_cast<VOP3A_instruction
*>(instr
);
/* instructions promoted from another format get a base offset added to their opcode */
537 if ((uint16_t) instr
->format
& (uint16_t) Format::VOP2
) {
538 opcode
= opcode
+ 0x100;
539 } else if ((uint16_t) instr
->format
& (uint16_t) Format::VOP1
) {
540 if (ctx
.chip_class
== GFX8
|| ctx
.chip_class
== GFX9
)
541 opcode
= opcode
+ 0x140;
543 opcode
= opcode
+ 0x180;
544 } else if ((uint16_t) instr
->format
& (uint16_t) Format::VOPC
) {
545 opcode
= opcode
+ 0x0;
546 } else if ((uint16_t) instr
->format
& (uint16_t) Format::VINTRP
) {
547 opcode
= opcode
+ 0x270;
551 if (ctx
.chip_class
<= GFX9
) {
552 encoding
= (0b110100 << 26);
553 } else if (ctx
.chip_class
>= GFX10
) {
554 encoding
= (0b110101 << 26);
556 unreachable("Unknown chip_class.");
559 if (ctx
.chip_class
<= GFX7
) {
560 encoding
|= opcode
<< 17;
561 encoding
|= (vop3
->clamp
? 1 : 0) << 11;
563 encoding
|= opcode
<< 16;
564 encoding
|= (vop3
->clamp
? 1 : 0) << 15;
566 encoding
|= vop3
->opsel
<< 11;
567 for (unsigned i
= 0; i
< 3; i
++)
568 encoding
|= vop3
->abs
[i
] << (8+i
);
569 if (instr
->definitions
.size() == 2)
570 encoding
|= instr
->definitions
[1].physReg() << 8;
571 encoding
|= (0xFF & instr
->definitions
[0].physReg());
572 out
.push_back(encoding
);
574 if (instr
->opcode
== aco_opcode::v_interp_mov_f32
) {
575 encoding
= 0x3 & instr
->operands
[0].constantValue();
/* each source operand occupies a 9-bit field */
577 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++)
578 encoding
|= instr
->operands
[i
].physReg() << (i
* 9);
580 encoding
|= vop3
->omod
<< 27;
581 for (unsigned i
= 0; i
< 3; i
++)
582 encoding
|= vop3
->neg
[i
] << (29+i
);
583 out
.push_back(encoding
);
585 } else if (instr
->format
== Format::VOP3P
) {
586 VOP3P_instruction
* vop3
= static_cast<VOP3P_instruction
*>(instr
);
589 if (ctx
.chip_class
== GFX9
) {
590 encoding
= (0b110100111 << 23);
591 } else if (ctx
.chip_class
>= GFX10
) {
592 encoding
= (0b110011 << 26);
594 unreachable("Unknown chip_class.");
597 encoding
|= opcode
<< 16;
598 encoding
|= (vop3
->clamp
? 1 : 0) << 15;
599 encoding
|= vop3
->opsel_lo
<< 11;
/* Bit 2 of opsel_hi goes to bit 14 of the first dword. The conditional must be
 * parenthesized: `c ? 1 : 0 << 14` parses as `c ? 1 : (0 << 14)` and would set bit 0. */
600 encoding
|= ((vop3
->opsel_hi
& 0x4) ? 1 : 0) << 14;
601 for (unsigned i
= 0; i
< 3; i
++)
602 encoding
|= vop3
->neg_hi
[i
] << (8+i
);
603 encoding
|= (0xFF & instr
->definitions
[0].physReg());
604 out
.push_back(encoding
);
606 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++)
607 encoding
|= instr
->operands
[i
].physReg() << (i
* 9);
/* The low two bits of opsel_hi go to bits 27-28 of the second dword. The mask must be
 * applied before the shift: `x & 0x3 << 27` parses as `x & (0x3 << 27)`. */
608 encoding
|= (vop3
->opsel_hi
& 0x3) << 27;
609 for (unsigned i
= 0; i
< 3; i
++)
610 encoding
|= vop3
->neg_lo
[i
] << (29+i
);
611 out
.push_back(encoding
);
613 } else if (instr
->isDPP()){
614 assert(ctx
.chip_class
>= GFX8
);
615 /* first emit the instruction without the DPP operand */
616 Operand dpp_op
= instr
->operands
[0];
617 instr
->operands
[0] = Operand(PhysReg
{250}, v1
);
618 instr
->format
= (Format
) ((uint16_t) instr
->format
& ~(uint16_t)Format::DPP
);
619 emit_instruction(ctx
, out
, instr
);
/* then append the DPP control dword, which carries the real source register */
620 DPP_instruction
* dpp
= static_cast<DPP_instruction
*>(instr
);
621 uint32_t encoding
= (0xF & dpp
->row_mask
) << 28;
622 encoding
|= (0xF & dpp
->bank_mask
) << 24;
623 encoding
|= dpp
->abs
[1] << 23;
624 encoding
|= dpp
->neg
[1] << 22;
625 encoding
|= dpp
->abs
[0] << 21;
626 encoding
|= dpp
->neg
[0] << 20;
627 if (ctx
.chip_class
>= GFX10
)
628 encoding
|= 1 << 18; /* set Fetch Inactive to match GFX9 behaviour */
629 encoding
|= dpp
->bound_ctrl
<< 19;
630 encoding
|= dpp
->dpp_ctrl
<< 8;
631 encoding
|= (0xFF) & dpp_op
.physReg();
632 out
.push_back(encoding
);
634 } else if (instr
->isSDWA()) {
635 /* first emit the instruction without the SDWA operand */
636 Operand sdwa_op
= instr
->operands
[0];
637 instr
->operands
[0] = Operand(PhysReg
{249}, v1
);
638 instr
->format
= (Format
) ((uint16_t) instr
->format
& ~(uint16_t)Format::SDWA
);
639 emit_instruction(ctx
, out
, instr
);
/* then append the SDWA control dword */
641 SDWA_instruction
* sdwa
= static_cast<SDWA_instruction
*>(instr
);
642 uint32_t encoding
= 0;
644 if ((uint16_t)instr
->format
& (uint16_t)Format::VOPC
) {
645 if (instr
->definitions
[0].physReg() != vcc
) {
646 encoding
|= instr
->definitions
[0].physReg() << 8;
649 encoding
|= (sdwa
->clamp
? 1 : 0) << 13;
651 encoding
|= get_sdwa_sel(sdwa
->dst_sel
, instr
->definitions
[0].physReg()) << 8;
652 uint32_t dst_u
= sdwa
->dst_sel
& sdwa_sext
? 1 : 0;
653 if (sdwa
->dst_preserve
|| (sdwa
->dst_sel
& sdwa_isra
))
655 encoding
|= dst_u
<< 11;
656 encoding
|= (sdwa
->clamp
? 1 : 0) << 13;
657 encoding
|= sdwa
->omod
<< 14;
660 encoding
|= get_sdwa_sel(sdwa
->sel
[0], sdwa_op
.physReg()) << 16;
661 encoding
|= sdwa
->sel
[0] & sdwa_sext
? 1 << 19 : 0;
662 encoding
|= sdwa
->abs
[0] << 21;
663 encoding
|= sdwa
->neg
[0] << 20;
665 if (instr
->operands
.size() >= 2) {
666 encoding
|= get_sdwa_sel(sdwa
->sel
[1], instr
->operands
[1].physReg()) << 24;
667 encoding
|= sdwa
->sel
[1] & sdwa_sext
? 1 << 27 : 0;
668 encoding
|= sdwa
->abs
[1] << 29;
669 encoding
|= sdwa
->neg
[1] << 28;
672 encoding
|= 0xFF & sdwa_op
.physReg();
/* bits 23 and 31 are set when the corresponding source register number is below 256 */
673 encoding
|= (sdwa_op
.physReg() < 256) << 23;
674 if (instr
->operands
.size() >= 2)
675 encoding
|= (instr
->operands
[1].physReg() < 256) << 31;
676 out
.push_back(encoding
);
678 unreachable("unimplemented instruction format");
683 /* append literal dword */
684 for (const Operand
& op
: instr
->operands
) {
685 if (op
.isLiteral()) {
686 out
.push_back(op
.constantValue());
/* Encodes every instruction of `block`, in order, into `out` by calling
 * emit_instruction on each one.
 * The visible statements around that call print the instruction and every dword
 * it produced (as 8-digit hex) to stderr.
 * NOTE(review): lines surrounding the stderr dump are not visible in this view;
 * presumably the dump is debug-only - confirm whether it is compiled in. */
692 void emit_block(asm_context
& ctx
, std::vector
<uint32_t>& out
, Block
& block
)
694 for (aco_ptr
<Instruction
>& instr
: block
.instructions
) {
/* remember where this instruction's dwords start so they can be dumped below */
696 int start_idx
= out
.size();
697 std::cerr
<< "Encoding:\t" << std::endl
;
698 aco_print_instr(&*instr
, stderr
);
699 std::cerr
<< std::endl
;
701 emit_instruction(ctx
, out
, instr
.get());
703 for (int i
= start_idx
; i
< out
.size(); i
++)
704 std::cerr
<< "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex
<< out
[i
] << std::endl
;
/* Walks the instructions of blocks flagged block_kind_export_end from last to first,
 * looking for export (Format::EXP) instructions. For the hw_vs / hw_ngg_gs stages
 * an export whose dest lies in [V_008DFC_SQ_EXP_POS, V_008DFC_SQ_EXP_POS + 3] gets
 * valid_mask set. The visible tail reports "Missing export" on stderr and dumps
 * the program.
 * NOTE(review): several statements (loop control, the other-stage path, how
 * `exported` is updated, and what follows the diagnostic) are not visible in this
 * view - confirm against the full source. `ctx` and `out` are not referenced by
 * any visible line. */
709 void fix_exports(asm_context
& ctx
, std::vector
<uint32_t>& out
, Program
* program
)
711 bool exported
= false;
712 for (Block
& block
: program
->blocks
) {
713 if (!(block
.kind
& block_kind_export_end
))
/* scan the block backwards */
715 std::vector
<aco_ptr
<Instruction
>>::reverse_iterator it
= block
.instructions
.rbegin();
716 while ( it
!= block
.instructions
.rend())
718 if ((*it
)->format
== Format::EXP
) {
719 Export_instruction
* exp
= static_cast<Export_instruction
*>((*it
).get());
720 if (program
->stage
& (hw_vs
| hw_ngg_gs
)) {
721 if (exp
->dest
>= V_008DFC_SQ_EXP_POS
&& exp
->dest
<= (V_008DFC_SQ_EXP_POS
+ 3)) {
728 exp
->valid_mask
= true;
/* an instruction that writes exec is treated specially while scanning (body not visible here) */
732 } else if ((*it
)->definitions
.size() && (*it
)->definitions
[0].physReg() == exec
)
739 /* Abort in order to avoid a GPU hang. */
740 fprintf(stderr
, "Missing export in %s shader:\n", (program
->stage
& hw_vs
) ? "vertex" : "fragment");
741 aco_print_program(program
, stderr
);
/* GFX10 workaround for branches whose offset equals 0x3f.
 * Repeatedly searches ctx.branches for a branch whose computed offset
 * (target block offset - branch position - 1) is 0x3f; when one is found an
 * s_nop dword (0xbf800000) is inserted into `out` right after that branch, and
 * the block offsets, later branch positions and constant-address positions that
 * lie after the insertion point are then updated. The loop runs until no such
 * branch remains.
 * NOTE(review): the loop header and the bodies of the update loops are not
 * visible in this view - presumably each affected position is incremented by
 * one; confirm against the full source. */
746 static void fix_branches_gfx10(asm_context
& ctx
, std::vector
<uint32_t>& out
)
748 /* Branches with an offset of 0x3f are buggy on GFX10, we workaround by inserting NOPs if needed. */
749 bool gfx10_3f_bug
= false;
/* find the first recorded branch whose offset would be exactly 0x3f */
752 auto buggy_branch_it
= std::find_if(ctx
.branches
.begin(), ctx
.branches
.end(), [&ctx
](const auto &branch
) -> bool {
753 return ((int)ctx
.program
->blocks
[branch
.second
->block
].offset
- branch
.first
- 1) == 0x3f;
756 gfx10_3f_bug
= buggy_branch_it
!= ctx
.branches
.end();
759 /* Insert an s_nop after the branch */
760 constexpr uint32_t s_nop_0
= 0xbf800000u
;
761 int s_nop_pos
= buggy_branch_it
->first
+ 1;
762 auto out_pos
= std::next(out
.begin(), s_nop_pos
);
763 out
.insert(out_pos
, s_nop_0
);
765 /* Update the offset of each affected block */
766 for (Block
& block
: ctx
.program
->blocks
) {
767 if (block
.offset
> (unsigned)buggy_branch_it
->first
)
771 /* Update the branches following the current one */
772 for (auto branch_it
= std::next(buggy_branch_it
); branch_it
!= ctx
.branches
.end(); ++branch_it
)
775 /* Find first constant address after the inserted instruction */
776 auto caddr_it
= std::find_if(ctx
.constaddrs
.begin(), ctx
.constaddrs
.end(), [s_nop_pos
](const int &caddr_pos
) -> bool {
777 return caddr_pos
>= s_nop_pos
;
780 /* Update the locations of constant addresses */
781 for (; caddr_it
!= ctx
.constaddrs
.end(); ++caddr_it
)
/* repeat while a branch with offset 0x3f was found in this pass */
785 } while (gfx10_3f_bug
);
/* Resolves the target offsets of all branches recorded in ctx.branches.
 * On GFX10 the 0x3f-offset workaround (fix_branches_gfx10) runs first. Then, for
 * each recorded (position, instruction) pair, the offset is computed as
 * target block offset - position - 1 and OR'ed, truncated to 16 bits, into the
 * already-emitted dword at that position in `out`. */
788 void fix_branches(asm_context
& ctx
, std::vector
<uint32_t>& out
)
790 if (ctx
.chip_class
== GFX10
)
791 fix_branches_gfx10(ctx
, out
);
793 for (std::pair
<int, SOPP_instruction
*> &branch
: ctx
.branches
) {
/* relative to the dword following the branch, hence the -1 */
794 int offset
= (int)ctx
.program
->blocks
[branch
.second
->block
].offset
- branch
.first
- 1;
795 out
[branch
.first
] |= (uint16_t) offset
;
/* Patches every dword whose position was recorded in ctx.constaddrs by adding
 * (out.size() - addr + 1) * 4 to it, i.e. a byte count derived from the distance
 * between that position and the current end of `out`.
 * NOTE(review): presumably this makes the literal point at data appended after
 * the code - confirm against the caller. */
799 void fix_constaddrs(asm_context
& ctx
, std::vector
<uint32_t>& out
)
801 for (unsigned addr
: ctx
.constaddrs
)
802 out
[addr
] += (out
.size() - addr
+ 1u) * 4u;
805 unsigned emit_program(Program
* program
,
806 std::vector
<uint32_t>& code
)
808 asm_context
ctx(program
);
810 if (program
->stage
& (hw_vs
| hw_fs
| hw_ngg_gs
))
811 fix_exports(ctx
, code
, program
);
813 for (Block
& block
: program
->blocks
) {
814 block
.offset
= code
.size();
815 emit_block(ctx
, code
, block
);
818 fix_branches(ctx
, code
);
820 unsigned exec_size
= code
.size() * sizeof(uint32_t);
822 if (program
->chip_class
>= GFX10
) {
823 /* Pad output with s_code_end so instruction prefetching doesn't cause
825 unsigned final_size
= align(code
.size() + 3 * 16, 16);
826 while (code
.size() < final_size
)
827 code
.push_back(0xbf9f0000u
);
830 fix_constaddrs(ctx
, code
);
832 while (program
->constant_data
.size() % 4u)
833 program
->constant_data
.push_back(0);
834 /* Copy constant data */
835 code
.insert(code
.end(), (uint32_t*)program
->constant_data
.data(),
836 (uint32_t*)(program
->constant_data
.data() + program
->constant_data
.size()));