aco: Initial commit of independent AMD compiler
src/amd/compiler/aco_lower_to_hw_instr.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
 *
 */

#include <map>

#include "aco_ir.h"
#include "aco_builder.h"
#include "util/u_math.h"
#include "sid.h"

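/* This pass lowers the remaining pseudo instructions (parallel copies,
 * vector creation/extraction, subgroup reductions, spills and branches)
 * to actual hardware instructions. */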
namespace aco {

struct lower_context {
   Program *program;
   std::vector<aco_ptr<Instruction>> instructions;
};

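/* Emit an operation where src0 is read through a DPP lane swizzle.
 * VOP3 encodings cannot carry DPP, so for VOP3 the swizzled source is first
 * copied into vtmp with a DPP v_mov_b32 (vtmp is optionally preloaded with
 * the identity value for lanes the swizzle leaves unwritten). */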
void emit_dpp_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1, PhysReg vtmp, PhysReg wrtmp,
                 aco_opcode op, Format format, bool clobber_vcc, unsigned dpp_ctrl,
                 unsigned row_mask, unsigned bank_mask, bool bound_ctrl_zero, unsigned size,
                 Operand *identity=NULL) /* for VOP3 with sparse writes */
{
   RegClass rc = RegClass(RegType::vgpr, size);
   if (format == Format::VOP3) {
      Builder bld(ctx->program, &ctx->instructions);

      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
      if (identity && size >= 2)
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), identity[1]);

      for (unsigned i = 0; i < size; i++)
         bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{src0+i}, v1),
                      dpp_ctrl, row_mask, bank_mask, bound_ctrl_zero);

      if (clobber_vcc)
         bld.vop3(op, Definition(dst, rc), Definition(vcc, s2), Operand(vtmp, rc), Operand(src1, rc));
      else
         bld.vop3(op, Definition(dst, rc), Operand(vtmp, rc), Operand(src1, rc));
   } else {
      assert(format == Format::VOP2 || format == Format::VOP1);
      assert(size == 1 || (op == aco_opcode::v_mov_b32));

      for (unsigned i = 0; i < size; i++) {
         aco_ptr<DPP_instruction> dpp{create_instruction<DPP_instruction>(
            op, (Format) ((uint32_t) format | (uint32_t) Format::DPP),
            format == Format::VOP2 ? 2 : 1, clobber_vcc ? 2 : 1)};
         dpp->operands[0] = Operand(PhysReg{src0+i}, rc);
         if (format == Format::VOP2)
            dpp->operands[1] = Operand(PhysReg{src1+i}, rc);
         dpp->definitions[0] = Definition(PhysReg{dst+i}, rc);
         if (clobber_vcc)
            dpp->definitions[1] = Definition(vcc, s2);
         dpp->dpp_ctrl = dpp_ctrl;
         dpp->row_mask = row_mask;
         dpp->bank_mask = bank_mask;
         dpp->bound_ctrl = bound_ctrl_zero;
         ctx->instructions.emplace_back(std::move(dpp));
      }
   }
}

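/* Returns the identity (neutral) element of the reduction, one dword at a
 * time: idx selects the low (0) or high (1) half for 64-bit operations. */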
uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
{
   switch (op) {
   case iadd32:
   case iadd64:
   case fadd32:
   case fadd64:
   case ior32:
   case ior64:
   case ixor32:
   case ixor64:
   case umax32:
   case umax64:
      return 0;
   case imul32:
   case imul64:
      return idx ? 0 : 1;
   case fmul32:
      return 0x3f800000u; /* 1.0 */
   case fmul64:
      return idx ? 0x3ff00000u : 0u; /* 1.0 */
   case imin32:
      return INT32_MAX;
   case imin64:
      return idx ? 0x7fffffffu : 0xffffffffu;
   case imax32:
      return INT32_MIN;
   case imax64:
      return idx ? 0x80000000u : 0;
   case umin32:
   case umin64:
   case iand32:
   case iand64:
      return 0xffffffffu;
   case fmin32:
      return 0x7f800000u; /* infinity */
   case fmin64:
      return idx ? 0x7ff00000u : 0u; /* infinity */
   case fmax32:
      return 0xff800000u; /* negative infinity */
   case fmax64:
      return idx ? 0xfff00000u : 0u; /* negative infinity */
   }
   unreachable("Invalid reduction operation");
}

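/* Selects the VALU opcode and encoding used to combine two lanes of the
 * reduction. 64-bit integer reductions are currently unsupported (asserts). */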
aco_opcode get_reduction_opcode(lower_context *ctx, ReduceOp op, bool *clobber_vcc, Format *format)
{
   *clobber_vcc = false;
   *format = Format::VOP2;
   switch (op) {
   case iadd32:
      *clobber_vcc = ctx->program->chip_class < GFX9;
      return ctx->program->chip_class < GFX9 ? aco_opcode::v_add_co_u32 : aco_opcode::v_add_u32;
   case imul32:
      *format = Format::VOP3;
      return aco_opcode::v_mul_lo_u32;
   case fadd32:
      return aco_opcode::v_add_f32;
   case fmul32:
      return aco_opcode::v_mul_f32;
   case imax32:
      return aco_opcode::v_max_i32;
   case imin32:
      return aco_opcode::v_min_i32;
   case umin32:
      return aco_opcode::v_min_u32;
   case umax32:
      return aco_opcode::v_max_u32;
   case fmin32:
      return aco_opcode::v_min_f32;
   case fmax32:
      return aco_opcode::v_max_f32;
   case iand32:
      return aco_opcode::v_and_b32;
   case ixor32:
      return aco_opcode::v_xor_b32;
   case ior32:
      return aco_opcode::v_or_b32;
   case iadd64:
   case imul64:
      assert(false);
      break;
   case fadd64:
      *format = Format::VOP3;
      return aco_opcode::v_add_f64;
   case fmul64:
      *format = Format::VOP3;
      return aco_opcode::v_mul_f64;
   case imin64:
   case imax64:
   case umin64:
   case umax64:
      assert(false);
      break;
   case fmin64:
      *format = Format::VOP3;
      return aco_opcode::v_min_f64;
   case fmax64:
      *format = Format::VOP3;
      return aco_opcode::v_max_f64;
   case iand64:
   case ior64:
   case ixor64:
      assert(false);
      break;
   }
   unreachable("Invalid reduction operation");
   return aco_opcode::v_min_u32;
}

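/* Emits a plain (non-DPP) binary VALU op in either VOP2 or VOP3 encoding. */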
void emit_vopn(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1,
               RegClass rc, aco_opcode op, Format format, bool clobber_vcc)
{
   aco_ptr<Instruction> instr;
   switch (format) {
   case Format::VOP2:
      instr.reset(create_instruction<VOP2_instruction>(op, format, 2, clobber_vcc ? 2 : 1));
      break;
   case Format::VOP3:
      instr.reset(create_instruction<VOP3A_instruction>(op, format, 2, clobber_vcc ? 2 : 1));
      break;
   default:
      assert(false);
   }
   instr->operands[0] = Operand(src0, rc);
   instr->operands[1] = Operand(src1, rc);
   instr->definitions[0] = Definition(dst, rc);
   if (clobber_vcc)
      instr->definitions[1] = Definition(vcc, s2);
   ctx->instructions.emplace_back(std::move(instr));
}

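/* Lowers a subgroup reduction or scan:
 * exec is saved to stmp and set to all ones, the inactive lanes of tmp are
 * overwritten with the identity, and lanes are then combined with DPP
 * swizzles of doubling cluster size (quad swaps, row half-mirror/mirror,
 * a ds_swizzle for clusters of 32, row broadcasts for the full wave).
 * Scans instead shift the input by one lane (exclusive only) and combine
 * with row shifts of 1, 2, 4 and 8 plus the row broadcasts. Finally, exec
 * is restored and, for a full reduction, the result is read from lane 63. */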
void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, PhysReg tmp,
                    PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
{
   assert(cluster_size == 64 || op == aco_opcode::p_reduce);

   Builder bld(ctx->program, &ctx->instructions);

   PhysReg wrtmp{0}; /* should never be needed */

   Format format;
   bool should_clobber_vcc;
   aco_opcode reduce_opcode = get_reduction_opcode(ctx, reduce_op, &should_clobber_vcc, &format);
   Operand identity[2];
   identity[0] = Operand(get_reduction_identity(reduce_op, 0));
   identity[1] = Operand(get_reduction_identity(reduce_op, 1));
   Operand vcndmask_identity[2] = {identity[0], identity[1]};

   /* First, copy the source to tmp and set inactive lanes to the identity */
   // note: this clobbers SCC!
   bld.sop1(aco_opcode::s_or_saveexec_b64, Definition(stmp, s2), Definition(scc, s1), Definition(exec, s2), Operand(UINT64_MAX), Operand(exec, s2));

   for (unsigned i = 0; i < src.size(); i++) {
      /* p_exclusive_scan needs it to be an sgpr or inline constant for the v_writelane_b32 */
      if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
         bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]);
         identity[i] = Operand(PhysReg{sitmp+i}, s1);

         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]);
         vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1);
      } else if (identity[i].isLiteral()) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]);
         vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1);
      }
   }

   for (unsigned i = 0; i < src.size(); i++) {
      bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
                   vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
                   Operand(stmp, s2));
   }

   bool exec_restored = false;
   bool dst_written = false;
   switch (op) {
   case aco_opcode::p_reduce:
      if (cluster_size == 1) break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_quad_perm(1, 0, 3, 2), 0xf, 0xf, false, src.size());
      if (cluster_size == 2) break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_quad_perm(2, 3, 0, 1), 0xf, 0xf, false, src.size());
      if (cluster_size == 4) break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_half_mirror, 0xf, 0xf, false, src.size());
      if (cluster_size == 8) break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_mirror, 0xf, 0xf, false, src.size());
      if (cluster_size == 16) break;
      if (cluster_size == 32) {
         for (unsigned i = 0; i < src.size(); i++)
            bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), ds_pattern_bitmode(0x1f, 0, 0x10));
         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2));
         exec_restored = true;
         emit_vopn(ctx, dst.physReg(), vtmp, tmp, src.regClass(), reduce_opcode, format, should_clobber_vcc);
         dst_written = true;
      } else {
         assert(cluster_size == 64);
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                     dpp_row_bcast15, 0xa, 0xf, false, src.size());
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                     dpp_row_bcast31, 0xc, 0xf, false, src.size());
      }
      break;
   case aco_opcode::p_exclusive_scan:
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
                  dpp_wf_sr1, 0xf, 0xf, true, src.size());
      for (unsigned i = 0; i < src.size(); i++) {
         if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
            assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
            bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
                     identity[i], Operand(0u));
         }
      }
      /* fall through */
   case aco_opcode::p_inclusive_scan:
      assert(cluster_size == 64);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_sr(1), 0xf, 0xf, false, src.size(), identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_sr(2), 0xf, 0xf, false, src.size(), identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_sr(4), 0xf, 0xf, false, src.size(), identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_sr(8), 0xf, 0xf, false, src.size(), identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
      break;
   default:
      unreachable("Invalid reduction mode");
   }

   if (!exec_restored)
      bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2));

   if (op == aco_opcode::p_reduce && cluster_size == 64) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{dst.physReg() + k}, s1),
                  Operand(PhysReg{tmp + k}, v1), Operand(63u));
      }
   } else if (!(dst.physReg() == tmp) && !dst_written) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
                  Operand(PhysReg{tmp + k}, v1));
      }
   }
}

struct copy_operation {
   Operand op;
   Definition def;
   unsigned uses;
   unsigned size;
};

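/* Resolves a set of parallel copies. The copies form a location-transfer
 * graph: copies whose destination is not itself a source (paths) are emitted
 * first; whatever remains forms cycles, which are broken with register swaps
 * (v_swap_b32 on GFX9+, otherwise a three-instruction xor swap). Constants
 * are materialized last, once the registers they overwrite have been read. */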
void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi)
{
   Builder bld(ctx->program, &ctx->instructions);
   aco_ptr<Instruction> mov;
   std::map<PhysReg, copy_operation>::iterator it = copy_map.begin();
   std::map<PhysReg, copy_operation>::iterator target;
   bool writes_scc = false;

   /* count the number of uses for each dst reg */
   while (it != copy_map.end()) {
      if (it->second.op.isConstant()) {
         ++it;
         continue;
      }

      if (it->second.def.physReg() == scc)
         writes_scc = true;

      assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));

      /* if src and dst reg are the same, remove operation */
      if (it->first == it->second.op.physReg()) {
         it = copy_map.erase(it);
         continue;
      }
      /* check if the operand reg may be overwritten by another copy operation */
      target = copy_map.find(it->second.op.physReg());
      if (target != copy_map.end()) {
         target->second.uses++;
      }

      ++it;
   }

   /* first, handle paths in the location transfer graph */
   bool preserve_scc = pi->tmp_in_scc && !writes_scc;
   it = copy_map.begin();
   while (it != copy_map.end()) {

      /* the target reg is not used as operand for any other copy */
      if (it->second.uses == 0) {

         /* try to coalesce 32-bit sgpr copies to 64-bit copies */
         if (it->second.def.getTemp().type() == RegType::sgpr && it->second.size == 1 &&
             !it->second.op.isConstant() && it->first % 2 == it->second.op.physReg() % 2) {

            PhysReg other_def_reg = PhysReg{it->first % 2 ? it->first - 1 : it->first + 1};
            PhysReg other_op_reg = PhysReg{it->first % 2 ? it->second.op.physReg() - 1 : it->second.op.physReg() + 1};
            std::map<PhysReg, copy_operation>::iterator other = copy_map.find(other_def_reg);

            if (other != copy_map.end() && !other->second.uses && other->second.size == 1 &&
                other->second.op.physReg() == other_op_reg && !other->second.op.isConstant()) {
               std::map<PhysReg, copy_operation>::iterator to_erase = it->first % 2 ? it : other;
               it = it->first % 2 ? other : it;
               copy_map.erase(to_erase);
               it->second.size = 2;
            }
         }

         if (it->second.def.physReg() == scc) {
            bld.sopc(aco_opcode::s_cmp_lg_i32, it->second.def, it->second.op, Operand(0u));
            preserve_scc = true;
         } else if (it->second.size == 2 && it->second.def.getTemp().type() == RegType::sgpr) {
            bld.sop1(aco_opcode::s_mov_b64, it->second.def, Operand(it->second.op.physReg(), s2));
         } else {
            bld.copy(it->second.def, it->second.op);
         }

         /* reduce the number of uses of the operand reg by one */
         if (!it->second.op.isConstant()) {
            for (unsigned i = 0; i < it->second.size; i++) {
               target = copy_map.find(PhysReg{it->second.op.physReg() + i});
               if (target != copy_map.end())
                  target->second.uses--;
            }
         }

         copy_map.erase(it);
         it = copy_map.begin();
         continue;
      } else {
         /* the target reg is used as operand, check the next entry */
         ++it;
      }
   }

   if (copy_map.empty())
      return;

   /* all target regs are needed as an operand somewhere, which means all entries are part of a cycle */
   bool constants = false;
   for (it = copy_map.begin(); it != copy_map.end(); ++it) {
      assert(it->second.op.isFixed());
      if (it->first == it->second.op.physReg())
         continue;
      /* do constants later */
      if (it->second.op.isConstant()) {
         constants = true;
         continue;
      }

      if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
         assert(!(it->second.def.physReg() == pi->scratch_sgpr));

      /* to resolve the cycle, we have to swap the src reg with the dst reg */
      copy_operation swap = it->second;
      assert(swap.op.regClass() == swap.def.regClass());
      Operand def_as_op = Operand(swap.def.physReg(), swap.def.regClass());
      Definition op_as_def = Definition(swap.op.physReg(), swap.op.regClass());
      if (chip_class >= GFX9 && swap.def.getTemp().type() == RegType::vgpr) {
         bld.vop1(aco_opcode::v_swap_b32, swap.def, op_as_def, swap.op, def_as_op);
      } else if (swap.op.physReg() == scc || swap.def.physReg() == scc) {
         /* we need to swap scc and another sgpr */
         assert(!preserve_scc);

         PhysReg other = swap.op.physReg() == scc ? swap.def.physReg() : swap.op.physReg();

         bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
         bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1), Operand(0u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
      } else if (swap.def.getTemp().type() == RegType::sgpr) {
         if (preserve_scc) {
            bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), swap.op);
            bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
            bld.sop1(aco_opcode::s_mov_b32, swap.def, Operand(pi->scratch_sgpr, s1));
         } else {
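            /* swap via three xors: a ^= b; b ^= a; a ^= b */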
            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), swap.op, def_as_op);
            bld.sop2(aco_opcode::s_xor_b32, swap.def, Definition(scc, s1), swap.op, def_as_op);
            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), swap.op, def_as_op);
         }
      } else {
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, swap.def, swap.op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op);
      }

      /* change the operand reg of the target's use */
      assert(swap.uses == 1);
      target = it;
      for (++target; target != copy_map.end(); ++target) {
         if (target->second.op.physReg() == it->first) {
            target->second.op.setFixed(swap.op.physReg());
            break;
         }
      }
   }

   /* copy constants into registers that were operands */
   if (constants) {
      for (it = copy_map.begin(); it != copy_map.end(); ++it) {
         if (!it->second.op.isConstant())
            continue;
         if (it->second.def.physReg() == scc) {
            bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(0u), Operand(it->second.op.constantValue() ? 1u : 0u));
         } else {
            bld.copy(it->second.def, it->second.op);
         }
      }
   }
}

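/* Replaces each block's pseudo instructions (copies, vector ops, discards,
 * spills, reductions, branches) with real hardware instructions. */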
void lower_to_hw_instr(Program* program)
{
   Block *discard_block = NULL;

   for (size_t i = 0; i < program->blocks.size(); i++)
   {
      Block *block = &program->blocks[i];
      lower_context ctx;
      ctx.program = program;
      Builder bld(program, &ctx.instructions);

      for (size_t j = 0; j < block->instructions.size(); j++) {
         aco_ptr<Instruction>& instr = block->instructions[j];
         aco_ptr<Instruction> mov;
         if (instr->format == Format::PSEUDO) {
            Pseudo_instruction *pi = (Pseudo_instruction*)instr.get();

            switch (instr->opcode)
            {
            case aco_opcode::p_extract_vector:
            {
               unsigned reg = instr->operands[0].physReg() + instr->operands[1].constantValue() * instr->definitions[0].size();
               RegClass rc = RegClass(instr->operands[0].getTemp().type(), 1);
               RegClass rc_def = RegClass(instr->definitions[0].getTemp().type(), 1);
               if (reg == instr->definitions[0].physReg())
                  break;

               std::map<PhysReg, copy_operation> copy_operations;
               for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
                  Definition def = Definition(PhysReg{instr->definitions[0].physReg() + i}, rc_def);
                  copy_operations[def.physReg()] = {Operand(PhysReg{reg + i}, rc), def, 0, 1};
               }
               handle_operands(copy_operations, &ctx, program->chip_class, pi);
               break;
            }
            case aco_opcode::p_create_vector:
            {
               std::map<PhysReg, copy_operation> copy_operations;
               RegClass rc_def = RegClass(instr->definitions[0].getTemp().type(), 1);
               unsigned reg_idx = 0;
               for (const Operand& op : instr->operands) {
                  if (op.isConstant()) {
                     const PhysReg reg = PhysReg{instr->definitions[0].physReg() + reg_idx};
                     const Definition def = Definition(reg, rc_def);
                     copy_operations[reg] = {op, def, 0, 1};
                     reg_idx++;
                     continue;
                  }

                  RegClass rc_op = RegClass(op.getTemp().type(), 1);
                  for (unsigned j = 0; j < op.size(); j++)
                  {
                     const Operand copy_op = Operand(PhysReg{op.physReg() + j}, rc_op);
                     const Definition def = Definition(PhysReg{instr->definitions[0].physReg() + reg_idx}, rc_def);
                     copy_operations[def.physReg()] = {copy_op, def, 0, 1};
                     reg_idx++;
                  }
               }
               handle_operands(copy_operations, &ctx, program->chip_class, pi);
               break;
            }
            case aco_opcode::p_split_vector:
            {
               std::map<PhysReg, copy_operation> copy_operations;
               RegClass rc_op = instr->operands[0].isConstant() ? s1 : RegClass(instr->operands[0].regClass().type(), 1);
               for (unsigned i = 0; i < instr->definitions.size(); i++) {
                  unsigned k = instr->definitions[i].size();
                  RegClass rc_def = RegClass(instr->definitions[i].getTemp().type(), 1);
                  for (unsigned j = 0; j < k; j++) {
                     Operand op = Operand(PhysReg{instr->operands[0].physReg() + (i*k+j)}, rc_op);
                     Definition def = Definition(PhysReg{instr->definitions[i].physReg() + j}, rc_def);
                     copy_operations[def.physReg()] = {op, def, 0, 1};
                  }
               }
               handle_operands(copy_operations, &ctx, program->chip_class, pi);
               break;
            }
            case aco_opcode::p_parallelcopy:
            case aco_opcode::p_wqm:
            {
               std::map<PhysReg, copy_operation> copy_operations;
               for (unsigned i = 0; i < instr->operands.size(); i++)
               {
                  Operand operand = instr->operands[i];
                  if (operand.isConstant() || operand.size() == 1) {
                     assert(instr->definitions[i].size() == 1);
                     copy_operations[instr->definitions[i].physReg()] = {operand, instr->definitions[i], 0, 1};
                  } else {
                     RegClass def_rc = RegClass(instr->definitions[i].regClass().type(), 1);
                     RegClass op_rc = RegClass(operand.getTemp().type(), 1);
                     for (unsigned j = 0; j < operand.size(); j++)
                     {
                        Operand op = Operand(PhysReg{instr->operands[i].physReg() + j}, op_rc);
                        Definition def = Definition(PhysReg{instr->definitions[i].physReg() + j}, def_rc);
                        copy_operations[def.physReg()] = {op, def, 0, 1};
                     }
                  }
               }
               handle_operands(copy_operations, &ctx, program->chip_class, pi);
               break;
            }
            case aco_opcode::p_discard_if:
            {
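               /* if the discard is not immediately followed by the end of
                * the program, branch to a shared block that exports null
                * and ends the program */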
               bool early_exit = false;
               if (block->instructions[j + 1]->opcode != aco_opcode::p_logical_end ||
                   block->instructions[j + 2]->opcode != aco_opcode::s_endpgm) {
                  early_exit = true;
               }

               if (early_exit && !discard_block) {
                  discard_block = program->create_and_insert_block();
                  block = &program->blocks[i];

                  bld.reset(discard_block);
                  bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
                          0, V_008DFC_SQ_EXP_NULL, false, true, true);
                  if (program->wb_smem_l1_on_end)
                     bld.smem(aco_opcode::s_dcache_wb);
                  bld.sopp(aco_opcode::s_endpgm);

                  bld.reset(&ctx.instructions);
               }

               // TODO: optimize uniform conditions
               Definition branch_cond = instr->definitions.back();
               Operand discard_cond = instr->operands.back();
               aco_ptr<Instruction> sop2;
               /* backwards, to finally branch on the global exec mask */
               for (int i = instr->operands.size() - 2; i >= 0; i--) {
                  bld.sop2(aco_opcode::s_andn2_b64,
                           instr->definitions[i], /* new mask */
                           branch_cond, /* scc */
                           instr->operands[i], /* old mask */
                           discard_cond);
               }

               if (early_exit) {
                  bld.sopp(aco_opcode::s_cbranch_scc0, bld.scc(branch_cond.getTemp()), discard_block->index);

                  discard_block->linear_preds.push_back(block->index);
                  block->linear_succs.push_back(discard_block->index);
               }

               break;
            }
            case aco_opcode::p_spill:
            {
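               /* spill each sgpr dword into a lane of a linear vgpr */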
               assert(instr->operands[0].regClass() == v1.as_linear());
               for (unsigned i = 0; i < instr->operands[2].size(); i++) {
                  bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1, instr->operands[0].physReg()),
                           Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
                           Operand(instr->operands[1].constantValue() + i));
               }
               break;
            }
            case aco_opcode::p_reload:
            {
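               /* read the spilled sgpr dwords back from the vgpr lanes */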
               assert(instr->operands[0].regClass() == v1.as_linear());
               for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
                  bld.vop3(aco_opcode::v_readlane_b32,
                           bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
                           instr->operands[0], Operand(instr->operands[1].constantValue() + i));
               }
               break;
            }
            case aco_opcode::p_as_uniform:
            {
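               /* sgpr sources are handled as plain copies; vgpr sources are
                * made uniform with one v_readfirstlane_b32 per dword */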
               if (instr->operands[0].isConstant() || instr->operands[0].regClass().type() == RegType::sgpr) {
                  std::map<PhysReg, copy_operation> copy_operations;
                  Operand operand = instr->operands[0];
                  if (operand.isConstant() || operand.size() == 1) {
                     assert(instr->definitions[0].size() == 1);
                     copy_operations[instr->definitions[0].physReg()] = {operand, instr->definitions[0], 0, 1};
                  } else {
                     for (unsigned i = 0; i < operand.size(); i++)
                     {
                        Operand op = Operand(PhysReg{operand.physReg() + i}, s1);
                        Definition def = Definition(PhysReg{instr->definitions[0].physReg() + i}, s1);
                        copy_operations[def.physReg()] = {op, def, 0, 1};
                     }
                  }

                  handle_operands(copy_operations, &ctx, program->chip_class, pi);
               } else {
                  assert(instr->operands[0].regClass().type() == RegType::vgpr);
                  assert(instr->definitions[0].regClass().type() == RegType::sgpr);
                  assert(instr->operands[0].size() == instr->definitions[0].size());
                  for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
                     bld.vop1(aco_opcode::v_readfirstlane_b32,
                              bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
                              Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
                  }
               }
               break;
            }
            default:
               break;
            }
         } else if (instr->format == Format::PSEUDO_BRANCH) {
            Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(instr.get());
            /* check if all blocks from current to target are empty */
            bool can_remove = block->index < branch->target[0];
            for (unsigned i = block->index + 1; can_remove && i < branch->target[0]; i++) {
               if (program->blocks[i].instructions.size())
                  can_remove = false;
            }
            if (can_remove)
               continue;

            switch (instr->opcode) {
            case aco_opcode::p_branch:
               assert(block->linear_succs[0] == branch->target[0]);
               bld.sopp(aco_opcode::s_branch, branch->target[0]);
               break;
            case aco_opcode::p_cbranch_nz:
               assert(block->linear_succs[1] == branch->target[0]);
               if (branch->operands[0].physReg() == exec)
                  bld.sopp(aco_opcode::s_cbranch_execnz, branch->target[0]);
               else if (branch->operands[0].physReg() == vcc)
                  bld.sopp(aco_opcode::s_cbranch_vccnz, branch->target[0]);
               else {
                  assert(branch->operands[0].physReg() == scc);
                  bld.sopp(aco_opcode::s_cbranch_scc1, branch->target[0]);
               }
               break;
            case aco_opcode::p_cbranch_z:
               assert(block->linear_succs[1] == branch->target[0]);
               if (branch->operands[0].physReg() == exec)
                  bld.sopp(aco_opcode::s_cbranch_execz, branch->target[0]);
               else if (branch->operands[0].physReg() == vcc)
                  bld.sopp(aco_opcode::s_cbranch_vccz, branch->target[0]);
               else {
                  assert(branch->operands[0].physReg() == scc);
                  bld.sopp(aco_opcode::s_cbranch_scc0, branch->target[0]);
               }
               break;
            default:
               unreachable("Unknown Pseudo branch instruction!");
            }

         } else if (instr->format == Format::PSEUDO_REDUCTION) {
            Pseudo_reduction_instruction* reduce = static_cast<Pseudo_reduction_instruction*>(instr.get());
            emit_reduction(&ctx, reduce->opcode, reduce->reduce_op, reduce->cluster_size,
                           reduce->operands[1].physReg(), // tmp
                           reduce->definitions[1].physReg(), // stmp
                           reduce->operands[2].physReg(), // vtmp
                           reduce->definitions[2].physReg(), // sitmp
                           reduce->operands[0], reduce->definitions[0]);
         } else {
            ctx.instructions.emplace_back(std::move(instr));
         }

      }
      block->instructions.swap(ctx.instructions);
   }
}

}