4 #include "common/sid.h"
5 #include "ac_shader_util.h"
/* Assembler state shared by emit_instruction, emit_block and the fix_* passes. */
/* Chip class copied from the program; selects encodings throughout the assembler. */
11 enum chip_class chip_class
;
/* Maps the output dword index of an emitted SOPP branch to its instruction.
 * Filled by emit_instruction (only when sopp->block != -1) and consumed by
 * fix_branches. */
12 std::map
<int, SOPP_instruction
*> branches
;
/* Output dword indices of the p_constaddr literals that fix_constaddrs patches. */
13 std::vector
<unsigned> constaddrs
;
/* Opcode table for the target chip, indexed by (int)aco_opcode. */
14 const int16_t* opcode
;
15 // TODO: keep track of branch instructions referring blocks
16 // and, when emitting the block, correct the offset in instr
/* Stores the program pointer and its chip class, then picks the opcode table:
 * opcode_gfx9 for chip_class <= GFX9, opcode_gfx10 for GFX10.
 * NOTE(review): no visible branch assigns `opcode` for a chip_class above
 * GFX10 - confirm such a program can never reach this constructor. */
17 asm_context(Program
* program
) : program(program
), chip_class(program
->chip_class
) {
18 if (chip_class
<= GFX9
)
19 opcode
= &instr_info
.opcode_gfx9
[0];
20 else if (chip_class
== GFX10
)
21 opcode
= &instr_info
.opcode_gfx10
[0];
/* Output dword index of the currently open s_subvector_loop_begin, or -1 when
 * no subvector loop is open (see the SOPK handling in emit_instruction). */
24 int subvector_begin_pos
= -1;
/* Encodes a single instruction into 32-bit dwords appended to `out`.
 *
 * ctx   - assembler state: supplies the chip class and opcode table, and
 *         records branch / constaddr positions for the later fix-up passes.
 * out   - output dword stream; every encoding is appended with push_back.
 * instr - instruction to encode. The DPP path below modifies instr->operands[0]
 *         and instr->format in place before re-entering this function.
 *
 * After the format-specific dwords, one extra dword is appended for every
 * operand that reports isLiteral().
 */
27 void emit_instruction(asm_context
& ctx
, std::vector
<uint32_t>& out
, Instruction
* instr
)
/* Byte offset of the next dword in `out`, i.e. where this instruction starts. */
29 uint32_t instr_offset
= out
.size() * 4u;
31 /* lower remaining pseudo-instructions */
32 if (instr
->opcode
== aco_opcode::p_constaddr
) {
33 unsigned dest
= instr
->definitions
[0].physReg();
34 unsigned offset
= instr
->operands
[0].constantValue();
36 /* s_getpc_b64 dest[0:1] */
37 uint32_t encoding
= (0b101111101 << 23);
38 uint32_t opcode
= ctx
.opcode
[(int)aco_opcode::s_getpc_b64
];
39 if (opcode
>= 55 && ctx
.chip_class
<= GFX9
) {
40 assert(ctx
.chip_class
== GFX9
&& opcode
< 60);
43 encoding
|= dest
<< 16;
44 encoding
|= opcode
<< 8;
45 out
.push_back(encoding
);
47 /* s_add_u32 dest[0], dest[0], ... */
48 encoding
= (0b10 << 30);
49 encoding
|= ctx
.opcode
[(int)aco_opcode::s_add_u32
] << 23;
50 encoding
|= dest
<< 16;
53 out
.push_back(encoding
);
/* Remember the index of the literal dword pushed next so that fix_constaddrs
 * can add the final code size to it. */
54 ctx
.constaddrs
.push_back(out
.size());
/* Literal: the constant offset minus the byte offset just past the first dword
 * emitted above (instr_offset + 4); completed later by fix_constaddrs. */
55 out
.push_back(-(instr_offset
+ 4) + offset
);
57 /* s_addc_u32 dest[1], dest[1], 0 */
58 encoding
= (0b10 << 30);
59 encoding
|= ctx
.opcode
[(int)aco_opcode::s_addc_u32
] << 23;
60 encoding
|= (dest
+ 1) << 16;
63 out
.push_back(encoding
);
/* Regular instructions: look up the hardware opcode for the target chip. */
67 uint32_t opcode
= ctx
.opcode
[(int)instr
->opcode
];
/* A table entry of -1 is reported as an unsupported opcode on stderr. */
68 if (opcode
== (uint32_t)-1) {
69 fprintf(stderr
, "Unsupported opcode: ");
70 aco_print_instr(instr
, stderr
);
/* Dispatch on the instruction format; each section below builds and pushes
 * the dword(s) of one encoding. */
74 switch (instr
->format
) {
/* Scalar two-source layout (presumably Format::SOP2): 0b10 prefix, opcode at
 * bit 23, definition at bit 16, operand 1 at bit 8, operand 0 at bit 0.
 * Missing definitions/operands are encoded as 0. */
76 uint32_t encoding
= (0b10 << 30);
77 encoding
|= opcode
<< 23;
78 encoding
|= !instr
->definitions
.empty() ? instr
->definitions
[0].physReg() << 16 : 0;
79 encoding
|= instr
->operands
.size() >= 2 ? instr
->operands
[1].physReg() << 8 : 0;
80 encoding
|= !instr
->operands
.empty() ? instr
->operands
[0].physReg() : 0;
81 out
.push_back(encoding
);
/* SOPK: 0b1011 prefix, opcode at bit 23, register at bit 16, 16-bit immediate
 * in the low bits. */
85 SOPK_instruction
*sopk
= static_cast<SOPK_instruction
*>(instr
);
/* Subvector loops: the begin instruction's position is remembered, and when
 * the matching end is reached both immediates are set relative to each other. */
87 if (instr
->opcode
== aco_opcode::s_subvector_loop_begin
) {
88 assert(ctx
.chip_class
>= GFX10
);
89 assert(ctx
.subvector_begin_pos
== -1);
90 ctx
.subvector_begin_pos
= out
.size();
91 } else if (instr
->opcode
== aco_opcode::s_subvector_loop_end
) {
92 assert(ctx
.chip_class
>= GFX10
);
93 assert(ctx
.subvector_begin_pos
!= -1);
94 /* Adjust s_subvector_loop_begin instruction to the address after the end */
95 out
[ctx
.subvector_begin_pos
] |= (out
.size() - ctx
.subvector_begin_pos
);
96 /* Adjust s_subvector_loop_end instruction to the address after the beginning */
97 sopk
->imm
= (uint16_t)(ctx
.subvector_begin_pos
- (int)out
.size());
98 ctx
.subvector_begin_pos
= -1;
101 uint32_t encoding
= (0b1011 << 28);
102 encoding
|= opcode
<< 23;
/* Register at bit 16: the first definition unless it is scc, otherwise the
 * first operand unless it is scc, otherwise 0. */
104 !instr
->definitions
.empty() && !(instr
->definitions
[0].physReg() == scc
) ?
105 instr
->definitions
[0].physReg() << 16 :
106 !instr
->operands
.empty() && !(instr
->operands
[0].physReg() == scc
) ?
107 instr
->operands
[0].physReg() << 16 : 0;
108 encoding
|= sopk
->imm
;
109 out
.push_back(encoding
);
/* Scalar one-source layout (presumably Format::SOP1): fixed 0b101111101 prefix,
 * definition at bit 16, opcode at bit 8, operand 0 at bit 0. */
113 uint32_t encoding
= (0b101111101 << 23);
114 if (opcode
>= 55 && ctx
.chip_class
<= GFX9
) {
115 assert(ctx
.chip_class
== GFX9
&& opcode
< 60);
118 encoding
|= !instr
->definitions
.empty() ? instr
->definitions
[0].physReg() << 16 : 0;
119 encoding
|= opcode
<< 8;
120 encoding
|= !instr
->operands
.empty() ? instr
->operands
[0].physReg() : 0;
121 out
.push_back(encoding
);
/* Scalar compare layout (presumably Format::SOPC): fixed 0b101111110 prefix,
 * opcode at bit 16, operand 1 at bit 8, operand 0 at bit 0. */
125 uint32_t encoding
= (0b101111110 << 23);
126 encoding
|= opcode
<< 16;
127 encoding
|= instr
->operands
.size() == 2 ? instr
->operands
[1].physReg() << 8 : 0;
128 encoding
|= !instr
->operands
.empty() ? instr
->operands
[0].physReg() : 0;
129 out
.push_back(encoding
);
/* SOPP: fixed 0b101111111 prefix, opcode at bit 16, 16-bit immediate below. */
133 SOPP_instruction
* sopp
= static_cast<SOPP_instruction
*>(instr
);
134 uint32_t encoding
= (0b101111111 << 23);
135 encoding
|= opcode
<< 16;
136 encoding
|= (uint16_t) sopp
->imm
;
/* Instructions that target a block are recorded by output index so that
 * fix_branches can OR in the offset once all block offsets are known. */
137 if (sopp
->block
!= -1)
138 ctx
.branches
.insert({out
.size(), sopp
});
139 out
.push_back(encoding
);
/* SMEM: two dwords. */
143 SMEM_instruction
* smem
= static_cast<SMEM_instruction
*>(instr
);
/* soe: the instruction carries at least 3 operands when it has a definition,
 * or at least 4 operands when it has none. */
144 bool soe
= instr
->operands
.size() >= (!instr
->definitions
.empty() ? 3 : 4);
/* An instruction with a definition is treated as a load. */
145 bool is_load
= !instr
->definitions
.empty();
147 uint32_t encoding
= 0;
/* First dword: the major opcode and the NV/DLC flag differ between <=GFX9
 * and later chips. */
149 if (ctx
.chip_class
<= GFX9
) {
150 encoding
= (0b110000 << 26);
151 assert(!smem
->dlc
); /* Device-level coherent is not supported on GFX9 and lower */
152 encoding
|= smem
->nv
? 1 << 15 : 0;
154 encoding
= (0b111101 << 26);
155 assert(!smem
->nv
); /* Non-volatile is not supported on GFX10 */
156 encoding
|= smem
->dlc
? 1 << 14 : 0;
159 encoding
|= opcode
<< 18;
160 encoding
|= smem
->glc
? 1 << 16 : 0;
162 if (ctx
.chip_class
<= GFX9
) {
163 if (instr
->operands
.size() >= 2)
164 encoding
|= instr
->operands
[1].isConstant() ? 1 << 17 : 0; /* IMM - immediate enable */
166 if (ctx
.chip_class
== GFX9
) {
167 encoding
|= soe
? 1 << 14 : 0;
170 if (is_load
|| instr
->operands
.size() >= 3) { /* SDATA */
171 encoding
|= (is_load
? instr
->definitions
[0].physReg().reg
: instr
->operands
[2].physReg().reg
) << 6;
173 if (instr
->operands
.size() >= 1) { /* SBASE */
174 encoding
|= instr
->operands
[0].physReg().reg
>> 1;
177 out
.push_back(encoding
);
/* Second dword: resolve the offset and the SOFFSET register (bit 25). */
181 uint32_t soffset
= ctx
.chip_class
>= GFX10
182 ? sgpr_null
/* On GFX10 this is disabled by specifying SGPR_NULL */
183 : 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on GFX8 and below) */
184 if (instr
->operands
.size() >= 2) {
185 const Operand
&op_off1
= instr
->operands
[1];
186 if (ctx
.chip_class
<= GFX9
) {
187 offset
= op_off1
.isConstant() ? op_off1
.constantValue() : op_off1
.physReg();
189 /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an SGPR */
190 if (op_off1
.isConstant()) {
191 offset
= op_off1
.constantValue();
193 soffset
= op_off1
.physReg();
194 assert(!soe
); /* There is no place to put the other SGPR offset, if any */
/* The last operand supplies a non-constant SOFFSET register. */
199 const Operand
&op_off2
= instr
->operands
.back();
200 assert(ctx
.chip_class
>= GFX9
); /* GFX8 and below don't support specifying a constant and an SGPR at the same time */
201 assert(!op_off2
.isConstant());
202 soffset
= op_off2
.physReg();
206 encoding
|= soffset
<< 25;
208 out
.push_back(encoding
);
/* Vector two-source layout (presumably Format::VOP2): opcode at bit 25,
 * definition at bit 17, operand 1 at bit 9, operand 0 at bit 0. */
212 uint32_t encoding
= 0;
213 encoding
|= opcode
<< 25;
214 encoding
|= (0xFF & instr
->definitions
[0].physReg().reg
) << 17;
215 encoding
|= (0xFF & instr
->operands
[1].physReg().reg
) << 9;
216 encoding
|= instr
->operands
[0].physReg().reg
;
217 out
.push_back(encoding
);
/* Vector one-source layout (presumably Format::VOP1): fixed 0b0111111 prefix,
 * definition at bit 17, opcode at bit 9, operand 0 at bit 0. */
221 uint32_t encoding
= (0b0111111 << 25);
222 encoding
|= (0xFF & instr
->definitions
[0].physReg().reg
) << 17;
223 encoding
|= opcode
<< 9;
224 encoding
|= instr
->operands
[0].physReg().reg
;
225 out
.push_back(encoding
);
/* Vector compare layout (presumably Format::VOPC): fixed 0b0111110 prefix,
 * opcode at bit 17, operand 1 at bit 9, operand 0 at bit 0. */
229 uint32_t encoding
= (0b0111110 << 25);
230 encoding
|= opcode
<< 17;
231 encoding
|= (0xFF & instr
->operands
[1].physReg().reg
) << 9;
232 encoding
|= instr
->operands
[0].physReg().reg
;
233 out
.push_back(encoding
);
/* VINTRP: one dword; the major opcode differs on GFX8/GFX9. */
236 case Format::VINTRP
: {
237 Interp_instruction
* interp
= static_cast<Interp_instruction
*>(instr
);
238 uint32_t encoding
= 0;
240 if (ctx
.chip_class
== GFX8
|| ctx
.chip_class
== GFX9
) {
241 encoding
= (0b110101 << 26); /* Vega ISA doc says 110010 but it's wrong */
243 encoding
= (0b110010 << 26);
247 encoding
|= (0xFF & instr
->definitions
[0].physReg().reg
) << 18;
248 encoding
|= opcode
<< 16;
249 encoding
|= interp
->attribute
<< 10;
250 encoding
|= interp
->component
<< 8;
/* v_interp_mov_f32 encodes a 2-bit constant in the source field; other
 * opcodes encode operand 0's register there. */
251 if (instr
->opcode
== aco_opcode::v_interp_mov_f32
)
252 encoding
|= (0x3 & instr
->operands
[0].constantValue());
254 encoding
|= (0xFF & instr
->operands
[0].physReg().reg
);
255 out
.push_back(encoding
);
/* DS: two dwords. On GFX8/GFX9 the opcode and GDS bit sit one bit lower
 * (17/16) than on the other chips (18/17). */
259 DS_instruction
* ds
= static_cast<DS_instruction
*>(instr
);
260 uint32_t encoding
= (0b110110 << 26);
261 if (ctx
.chip_class
== GFX8
|| ctx
.chip_class
== GFX9
) {
262 encoding
|= opcode
<< 17;
263 encoding
|= (ds
->gds
? 1 : 0) << 16;
265 encoding
|= opcode
<< 18;
266 encoding
|= (ds
->gds
? 1 : 0) << 17;
268 encoding
|= ((0xFF & ds
->offset1
) << 8);
269 encoding
|= (0xFFFF & ds
->offset0
);
270 out
.push_back(encoding
);
/* Second dword: definition at bit 24, operand 2 at bit 16, operand 1 at
 * bit 8, operand 0 at bit 0. Operands 1 and 2 are encoded as 0 when absent
 * or when they are m0. */
272 unsigned reg
= !instr
->definitions
.empty() ? instr
->definitions
[0].physReg() : 0;
273 encoding
|= (0xFF & reg
) << 24;
274 reg
= instr
->operands
.size() >= 3 && !(instr
->operands
[2].physReg() == m0
) ? instr
->operands
[2].physReg() : 0;
275 encoding
|= (0xFF & reg
) << 16;
276 reg
= instr
->operands
.size() >= 2 && !(instr
->operands
[1].physReg() == m0
) ? instr
->operands
[1].physReg() : 0;
277 encoding
|= (0xFF & reg
) << 8;
278 encoding
|= (0xFF & instr
->operands
[0].physReg().reg
);
279 out
.push_back(encoding
);
/* MUBUF: two dwords. */
282 case Format::MUBUF
: {
283 MUBUF_instruction
* mubuf
= static_cast<MUBUF_instruction
*>(instr
);
284 uint32_t encoding
= (0b111000 << 26);
285 encoding
|= opcode
<< 18;
286 encoding
|= (mubuf
->lds
? 1 : 0) << 16;
287 encoding
|= (mubuf
->glc
? 1 : 0) << 14;
288 encoding
|= (mubuf
->idxen
? 1 : 0) << 13;
289 encoding
|= (mubuf
->offen
? 1 : 0) << 12;
/* SLC is written at bit 17 here on <=GFX9; on GFX10 it is written at
 * bit 22 after the first push below, and DLC takes bit 15 here. */
290 if (ctx
.chip_class
<= GFX9
) {
291 assert(!mubuf
->dlc
); /* Device-level coherent is not supported on GFX9 and lower */
292 encoding
|= (mubuf
->slc
? 1 : 0) << 17;
293 } else if (ctx
.chip_class
>= GFX10
) {
294 encoding
|= (mubuf
->dlc
? 1 : 0) << 15;
296 encoding
|= 0x0FFF & mubuf
->offset
;
297 out
.push_back(encoding
);
299 if (ctx
.chip_class
>= GFX10
) {
300 encoding
|= (mubuf
->slc
? 1 : 0) << 22;
302 encoding
|= instr
->operands
[2].physReg() << 24;
303 encoding
|= (mubuf
->tfe
? 1 : 0) << 23;
/* Operand 1's register number is divided by 4 before being placed at bit 16. */
304 encoding
|= (instr
->operands
[1].physReg() >> 2) << 16;
/* Data register: operand 3 when present, otherwise the first definition. */
305 unsigned reg
= instr
->operands
.size() > 3 ? instr
->operands
[3].physReg() : instr
->definitions
[0].physReg().reg
;
306 encoding
|= (0xFF & reg
) << 8;
307 encoding
|= (0xFF & instr
->operands
[0].physReg().reg
);
308 out
.push_back(encoding
);
/* MTBUF: two dwords. */
311 case Format::MTBUF
: {
312 MTBUF_instruction
* mtbuf
= static_cast<MTBUF_instruction
*>(instr
);
/* Format field value computed by ac_get_tbuffer_format from dfmt/nfmt for this chip. */
314 uint32_t img_format
= ac_get_tbuffer_format(ctx
.chip_class
, mtbuf
->dfmt
, mtbuf
->nfmt
);
315 uint32_t encoding
= (0b111010 << 26);
316 assert(!mtbuf
->dlc
|| ctx
.chip_class
>= GFX10
);
317 encoding
|= (mtbuf
->dlc
? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */
318 encoding
|= (mtbuf
->glc
? 1 : 0) << 14;
319 encoding
|= (mtbuf
->idxen
? 1 : 0) << 13;
320 encoding
|= (mtbuf
->offen
? 1 : 0) << 12;
321 encoding
|= 0x0FFF & mtbuf
->offset
;
322 encoding
|= (img_format
<< 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */
324 if (ctx
.chip_class
<= GFX9
) {
325 encoding
|= opcode
<< 15;
327 encoding
|= (opcode
& 0x07) << 16; /* 3 LSBs of 4-bit OPCODE */
330 out
.push_back(encoding
);
333 encoding
|= instr
->operands
[2].physReg().reg
<< 24;
334 encoding
|= (mtbuf
->tfe
? 1 : 0) << 23;
335 encoding
|= (mtbuf
->slc
? 1 : 0) << 22;
336 encoding
|= (instr
->operands
[1].physReg().reg
>> 2) << 16;
/* Data register: operand 3 when present, otherwise the first definition. */
337 unsigned reg
= instr
->operands
.size() > 3 ? instr
->operands
[3].physReg().reg
: instr
->definitions
[0].physReg().reg
;
338 encoding
|= (0xFF & reg
) << 8;
339 encoding
|= (0xFF & instr
->operands
[0].physReg().reg
);
/* NOTE(review): (opcode & 0x08) >> 4 always evaluates to 0, so bit 21 is
 * never set by the statement below. The "MSB of 4-bit OPCODE" comment suggests
 * a shift by 3 was intended - confirm against the GFX10 MTBUF encoding. */
341 if (ctx
.chip_class
>= GFX10
) {
342 encoding
|= (((opcode
& 0x08) >> 4) << 21); /* MSB of 4-bit OPCODE */
345 out
.push_back(encoding
);
/* MIMG: two dwords. */
349 MIMG_instruction
* mimg
= static_cast<MIMG_instruction
*>(instr
);
350 uint32_t encoding
= (0b111100 << 26);
351 encoding
|= mimg
->slc
? 1 << 25 : 0;
352 encoding
|= opcode
<< 18;
353 encoding
|= mimg
->lwe
? 1 << 17 : 0;
354 encoding
|= mimg
->tfe
? 1 << 16 : 0;
355 encoding
|= mimg
->glc
? 1 << 13 : 0;
356 encoding
|= mimg
->unrm
? 1 << 12 : 0;
357 if (ctx
.chip_class
<= GFX9
) {
358 assert(!mimg
->dlc
); /* Device-level coherent is not supported on GFX9 and lower */
360 encoding
|= mimg
->a16
? 1 << 15 : 0;
361 encoding
|= mimg
->da
? 1 << 14 : 0;
363 encoding
|= mimg
->r128
? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
364 encoding
|= mimg
->dim
<< 3; /* GFX10: dimensionality instead of declare array */
365 encoding
|= mimg
->dlc
? 1 << 7 : 0;
367 encoding
|= (0xF & mimg
->dmask
) << 8;
368 out
.push_back(encoding
);
369 encoding
= (0xFF & instr
->operands
[0].physReg().reg
); /* VADDR */
/* VDATA comes from the definition when there is one, otherwise from
 * operand 3 when the instruction has exactly four operands. */
370 if (!instr
->definitions
.empty()) {
371 encoding
|= (0xFF & instr
->definitions
[0].physReg().reg
) << 8; /* VDATA */
372 } else if (instr
->operands
.size() == 4) {
373 encoding
|= (0xFF & instr
->operands
[3].physReg().reg
) << 8; /* VDATA */
375 encoding
|= (0x1F & (instr
->operands
[1].physReg() >> 2)) << 16; /* T# (resource) */
376 if (instr
->operands
.size() > 2)
377 encoding
|= (0x1F & (instr
->operands
[2].physReg() >> 2)) << 21; /* sampler */
379 assert(!mimg
->d16
|| ctx
.chip_class
>= GFX9
);
380 encoding
|= mimg
->d16
? 1 << 15 : 0;
381 if (ctx
.chip_class
>= GFX10
) {
382 encoding
|= mimg
->a16
? 1 << 14 : 0; /* GFX10: A16 still exists, but is in a different place */
385 out
.push_back(encoding
);
/* SCRATCH and GLOBAL share this two-dword layout; the Format::FLAT checks
 * further down suggest FLAT instructions take this path as well. */
389 case Format::SCRATCH
:
390 case Format::GLOBAL
: {
391 FLAT_instruction
*flat
= static_cast<FLAT_instruction
*>(instr
);
392 uint32_t encoding
= (0b110111 << 26);
393 encoding
|= opcode
<< 18;
/* The offset field is limited to 13 bits on <=GFX9 and 12 bits otherwise. */
394 if (ctx
.chip_class
<= GFX9
) {
395 assert(flat
->offset
<= 0x1fff);
396 encoding
|= flat
->offset
& 0x1fff;
398 assert(flat
->offset
<= 0x0fff);
399 encoding
|= flat
->offset
& 0x0fff;
401 if (instr
->format
== Format::SCRATCH
)
403 else if (instr
->format
== Format::GLOBAL
)
405 encoding
|= flat
->lds
? 1 << 13 : 0;
406 encoding
|= flat
->glc
? 1 << 16 : 0;
407 encoding
|= flat
->slc
? 1 << 17 : 0;
408 if (ctx
.chip_class
>= GFX10
) {
410 encoding
|= flat
->dlc
? 1 << 12 : 0;
414 out
.push_back(encoding
);
/* Second dword: operand 0 at bit 0, operand 2 at bit 8, operand 1 (or a
 * placeholder) at bit 16, definition at bit 24. */
415 encoding
= (0xFF & instr
->operands
[0].physReg());
416 if (!instr
->definitions
.empty())
417 encoding
|= (0xFF & instr
->definitions
[0].physReg()) << 24;
419 encoding
|= (0xFF & instr
->operands
[2].physReg()) << 8;
/* Operand 1: an explicit register is only allowed for non-FLAT formats.
 * When it is undefined, non-FLAT formats encode 0x7F on <=GFX9 and
 * sgpr_null otherwise. */
420 if (!instr
->operands
[1].isUndefined()) {
421 assert(ctx
.chip_class
>= GFX10
|| instr
->operands
[1].physReg() != 0x7F);
422 assert(instr
->format
!= Format::FLAT
);
423 encoding
|= instr
->operands
[1].physReg() << 16;
424 } else if (instr
->format
!= Format::FLAT
) {
425 if (ctx
.chip_class
<= GFX9
)
426 encoding
|= 0x7F << 16;
428 encoding
|= sgpr_null
<< 16;
430 encoding
|= flat
->nv
? 1 << 23 : 0;
431 out
.push_back(encoding
);
/* Export: two dwords; the major opcode differs between <=GFX9 and GFX10+,
 * and the second dword packs the four operand registers, 8 bits each. */
435 Export_instruction
* exp
= static_cast<Export_instruction
*>(instr
);
437 if (ctx
.chip_class
<= GFX9
) {
438 encoding
= (0b110001 << 26);
439 } else if (ctx
.chip_class
>= GFX10
) {
440 encoding
= (0b111110 << 26);
443 encoding
|= exp
->valid_mask
? 0b1 << 12 : 0;
444 encoding
|= exp
->done
? 0b1 << 11 : 0;
445 encoding
|= exp
->compressed
? 0b1 << 10 : 0;
446 encoding
|= exp
->dest
<< 4;
447 encoding
|= exp
->enabled_mask
;
448 out
.push_back(encoding
);
449 encoding
= 0xFF & exp
->operands
[0].physReg().reg
;
450 encoding
|= (0xFF & exp
->operands
[1].physReg().reg
) << 8;
451 encoding
|= (0xFF & exp
->operands
[2].physReg().reg
) << 16;
452 encoding
|= (0xFF & exp
->operands
[3].physReg().reg
) << 24;
453 out
.push_back(encoding
);
457 case Format::PSEUDO_BARRIER
:
458 unreachable("Pseudo instructions should be lowered before assembly.");
/* Formats are tested as bit flags here: an instruction with the VOP3A bit
 * may also carry the VOP2/VOP1/VOPC/VINTRP bit, in which case a per-format
 * base is added to its opcode before encoding. */
460 if ((uint16_t) instr
->format
& (uint16_t) Format::VOP3A
) {
461 VOP3A_instruction
* vop3
= static_cast<VOP3A_instruction
*>(instr
);
463 if ((uint16_t) instr
->format
& (uint16_t) Format::VOP2
) {
464 opcode
= opcode
+ 0x100;
465 } else if ((uint16_t) instr
->format
& (uint16_t) Format::VOP1
) {
466 if (ctx
.chip_class
<= GFX9
) {
467 opcode
= opcode
+ 0x140;
469 /* RDNA ISA doc says this is 0x140, but that doesn't work */
470 opcode
= opcode
+ 0x180;
472 } else if ((uint16_t) instr
->format
& (uint16_t) Format::VOPC
) {
473 opcode
= opcode
+ 0x0;
474 } else if ((uint16_t) instr
->format
& (uint16_t) Format::VINTRP
) {
475 opcode
= opcode
+ 0x270;
/* First VOP3 dword: opcode at bit 16, clamp at bit 15, abs at bits 8-10,
 * opsel at bits 11-14, first definition in the low 8 bits. */
479 if (ctx
.chip_class
<= GFX9
) {
480 encoding
= (0b110100 << 26);
481 } else if (ctx
.chip_class
== GFX10
) {
482 encoding
= (0b110101 << 26);
485 encoding
|= opcode
<< 16;
486 encoding
|= (vop3
->clamp
? 1 : 0) << 15;
487 for (unsigned i
= 0; i
< 3; i
++)
488 encoding
|= vop3
->abs
[i
] << (8+i
);
489 for (unsigned i
= 0; i
< 4; i
++)
490 encoding
|= vop3
->opsel
[i
] << (11+i
);
/* A second definition, when present, is encoded starting at bit 8 (the same
 * bits the abs flags use above). */
491 if (instr
->definitions
.size() == 2)
492 encoding
|= instr
->definitions
[1].physReg() << 8;
493 encoding
|= (0xFF & instr
->definitions
[0].physReg().reg
);
494 out
.push_back(encoding
);
/* Second VOP3 dword: operand i is placed at bit i * 9 (v_interp_mov_f32
 * encodes a 2-bit constant instead), omod at bit 27, neg at bits 29-31. */
496 if (instr
->opcode
== aco_opcode::v_interp_mov_f32
) {
497 encoding
= 0x3 & instr
->operands
[0].constantValue();
499 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++)
500 encoding
|= instr
->operands
[i
].physReg() << (i
* 9);
502 encoding
|= vop3
->omod
<< 27;
503 for (unsigned i
= 0; i
< 3; i
++)
504 encoding
|= vop3
->neg
[i
] << (29+i
);
505 out
.push_back(encoding
);
/* DPP: the base instruction is encoded by a recursive call with operand 0
 * replaced by register 250 and bit 14 cleared from the format (presumably the
 * DPP flag); the real operand 0 then goes into the extra DPP dword.
 * NOTE(review): instr->operands[0] and instr->format are modified in place
 * and no restore is visible here - confirm callers do not rely on them. */
507 } else if (instr
->isDPP()){
508 /* first emit the instruction without the DPP operand */
509 Operand dpp_op
= instr
->operands
[0];
510 instr
->operands
[0] = Operand(PhysReg
{250}, v1
);
511 instr
->format
= (Format
) ((uint32_t) instr
->format
& ~(1 << 14));
512 emit_instruction(ctx
, out
, instr
);
513 DPP_instruction
* dpp
= static_cast<DPP_instruction
*>(instr
);
/* DPP dword: row_mask at bit 28, bank_mask at bit 24, abs/neg modifiers at
 * bits 20-23, bound_ctrl at bit 19, dpp_ctrl at bit 8, saved operand 0 in
 * the low 8 bits. */
514 uint32_t encoding
= (0xF & dpp
->row_mask
) << 28;
515 encoding
|= (0xF & dpp
->bank_mask
) << 24;
516 encoding
|= dpp
->abs
[1] << 23;
517 encoding
|= dpp
->neg
[1] << 22;
518 encoding
|= dpp
->abs
[0] << 21;
519 encoding
|= dpp
->neg
[0] << 20;
520 encoding
|= dpp
->bound_ctrl
<< 19;
521 encoding
|= dpp
->dpp_ctrl
<< 8;
522 encoding
|= (0xFF) & dpp_op
.physReg().reg
;
523 out
.push_back(encoding
);
526 unreachable("unimplemented instruction format");
531 /* append literal dword */
532 for (const Operand
& op
: instr
->operands
) {
533 if (op
.isLiteral()) {
534 out
.push_back(op
.constantValue());
/* Encodes every instruction of `block`, in order, appending the resulting
 * dwords to `out` through emit_instruction.
 *
 * The visible statements also print each instruction and the dwords it
 * produced to stderr.
 * NOTE(review): confirm whether that diagnostic output is conditionally
 * compiled before relying on it.
 */
540 void emit_block(asm_context
& ctx
, std::vector
<uint32_t>& out
, Block
& block
)
542 for (aco_ptr
<Instruction
>& instr
: block
.instructions
) {
/* Index of the first dword this instruction will produce. */
544 int start_idx
= out
.size();
545 std::cerr
<< "Encoding:\t" << std::endl
;
546 aco_print_instr(&*instr
, stderr
);
547 std::cerr
<< std::endl
;
549 emit_instruction(ctx
, out
, instr
.get());
/* Print the dwords just appended for this instruction as 8-digit hex.
 * NOTE(review): `i` is a signed int compared against the unsigned out.size(). */
551 for (int i
= start_idx
; i
< out
.size(); i
++)
552 std::cerr
<< "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex
<< out
[i
] << std::endl
;
/* Adjusts the export instructions at the end of the program, and inserts a
 * null export when none is found.
 *
 * Blocks are visited from last to first and each block's instructions in
 * reverse order. `ctx` and `out` are not referenced by the visible statements.
 */
557 void fix_exports(asm_context
& ctx
, std::vector
<uint32_t>& out
, Program
* program
)
559 for (int idx
= program
->blocks
.size() - 1; idx
>= 0; idx
--) {
560 Block
& block
= program
->blocks
[idx
];
561 std::vector
<aco_ptr
<Instruction
>>::reverse_iterator it
= block
.instructions
.rbegin();
/* endBlock: presumably set once s_endpgm has been seen in this block;
 * exported: presumably set once a suitable export has been found. */
562 bool endBlock
= false;
563 bool exported
= false;
564 while ( it
!= block
.instructions
.rend())
/* Only EXP instructions encountered while endBlock is set are considered. */
566 if ((*it
)->format
== Format::EXP
&& endBlock
) {
567 Export_instruction
* exp
= static_cast<Export_instruction
*>((*it
).get());
/* For hw_vs stages the export must target one of the four position slots
 * starting at V_008DFC_SQ_EXP_POS. */
568 if (program
->stage
& hw_vs
) {
569 if (exp
->dest
>= V_008DFC_SQ_EXP_POS
&& exp
->dest
<= (V_008DFC_SQ_EXP_POS
+ 3)) {
576 exp
->valid_mask
= true;
/* An instruction whose first definition is exec is handled separately. */
580 } else if ((*it
)->definitions
.size() && (*it
)->definitions
[0].physReg() == exec
)
582 else if ((*it
)->opcode
== aco_opcode::s_endpgm
) {
589 if (!endBlock
|| exported
)
591 /* we didn't find an Export instruction and have to insert a null export */
592 aco_ptr
<Export_instruction
> exp
{create_instruction
<Export_instruction
>(aco_opcode::exp
, Format::EXP
, 4, 0)};
/* All four operands are v1 operands with nothing enabled in the mask. */
593 for (unsigned i
= 0; i
< 4; i
++)
594 exp
->operands
[i
] = Operand(v1
);
595 exp
->enabled_mask
= 0;
596 exp
->compressed
= false;
/* valid_mask is set only for hw_fs; the destination is 9 for hw_fs and
 * V_008DFC_SQ_EXP_POS otherwise. */
598 exp
->valid_mask
= program
->stage
& hw_fs
;
599 if (program
->stage
& hw_fs
)
600 exp
->dest
= 9; /* NULL */
602 exp
->dest
= V_008DFC_SQ_EXP_POS
;
603 /* insert the null export 1 instruction before endpgm */
604 block
.instructions
.insert(block
.instructions
.end() - 1, std::move(exp
));
/* Patches the immediate of every branch recorded in ctx.branches.
 *
 * For each entry, branch.first is the dword index of the branch in `out` and
 * branch.second->block is the index of the target block. The offset is the
 * target block's offset minus the branch index minus 1, and its low 16 bits
 * are OR-ed into the already emitted dword.
 */
608 void fix_branches(asm_context
& ctx
, std::vector
<uint32_t>& out
)
610 for (std::pair
<int, SOPP_instruction
*> branch
: ctx
.branches
)
/* Distance in dwords from the dword following the branch to the target block. */
612 int offset
= (int)ctx
.program
->blocks
[branch
.second
->block
].offset
- branch
.first
- 1;
613 out
[branch
.first
] |= (uint16_t) offset
;
/* Completes the literals emitted for p_constaddr.
 *
 * Each index in ctx.constaddrs refers to a dword in `out` that
 * emit_instruction initialised to -(instr_offset + 4) + offset. Adding the
 * current size of `out` in bytes turns it into a distance measured from that
 * instruction to a position past the end of the code emitted so far.
 */
617 void fix_constaddrs(asm_context
& ctx
, std::vector
<uint32_t>& out
)
619 for (unsigned addr
: ctx
.constaddrs
)
620 out
[addr
] += out
.size() * 4u;
/* Assembles the whole program into `code` and appends its constant data.
 *
 * program - program to assemble; block offsets are written back into each
 *           block, and constant_data is zero-padded to a multiple of 4 bytes.
 * code    - output dword stream.
 *
 * Returns the byte offset within `code` at which the constant data starts.
 */
623 unsigned emit_program(Program
* program
,
624 std::vector
<uint32_t>& code
)
626 asm_context
ctx(program
);
/* Export fix-ups only apply to hw_vs and hw_fs stages. */
628 if (program
->stage
& (hw_vs
| hw_fs
))
629 fix_exports(ctx
, code
, program
);
/* Record each block's starting dword index before emitting it; fix_branches
 * reads these offsets afterwards. */
631 for (Block
& block
: program
->blocks
) {
632 block
.offset
= code
.size();
633 emit_block(ctx
, code
, block
);
/* Both fix-up passes run before the constant data is appended, so the size
 * fix_constaddrs sees equals constant_data_offset below. */
636 fix_branches(ctx
, code
);
637 fix_constaddrs(ctx
, code
);
639 unsigned constant_data_offset
= code
.size() * sizeof(uint32_t);
/* Pad the constant data with zero bytes up to a whole number of dwords. */
640 while (program
->constant_data
.size() % 4u)
641 program
->constant_data
.push_back(0);
642 /* Copy constant data */
643 code
.insert(code
.end(), (uint32_t*)program
->constant_data
.data(),
644 (uint32_t*)(program
->constant_data
.data() + program
->constant_data
.size()));
646 return constant_data_offset
;