aco: Fix reductions on GFX10.
[mesa.git] / src / amd / compiler / aco_lower_to_hw_instr.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
 *
 */

#include <map>

#include "aco_ir.h"
#include "aco_builder.h"
#include "util/u_math.h"
#include "sid.h"


namespace aco {

struct lower_context {
   Program *program;
   std::vector<aco_ptr<Instruction>> instructions;
};

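/* Emits op combining the DPP-swizzled src0 with src1. VOP3 opcodes cannot
 * encode DPP directly, so for those the swizzle is emitted as separate DPP
 * v_mov_b32s into vtmp first (optionally preloaded with the identity, since
 * the moves may leave some lanes unwritten). */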
void emit_dpp_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1, PhysReg vtmp, PhysReg wrtmp,
                 aco_opcode op, Format format, bool clobber_vcc, unsigned dpp_ctrl,
                 unsigned row_mask, unsigned bank_mask, bool bound_ctrl_zero, unsigned size,
                 Operand *identity=NULL) /* for VOP3 with sparse writes */
{
   RegClass rc = RegClass(RegType::vgpr, size);
   if (format == Format::VOP3) {
      Builder bld(ctx->program, &ctx->instructions);

      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
      if (identity && size >= 2)
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), identity[1]);

      for (unsigned i = 0; i < size; i++)
         bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{src0+i}, v1),
                      dpp_ctrl, row_mask, bank_mask, bound_ctrl_zero);

      if (clobber_vcc)
         bld.vop3(op, Definition(dst, rc), Definition(vcc, s2), Operand(vtmp, rc), Operand(src1, rc));
      else
         bld.vop3(op, Definition(dst, rc), Operand(vtmp, rc), Operand(src1, rc));
   } else {
      assert(format == Format::VOP2 || format == Format::VOP1);
      assert(size == 1 || (op == aco_opcode::v_mov_b32));

      for (unsigned i = 0; i < size; i++) {
         aco_ptr<DPP_instruction> dpp{create_instruction<DPP_instruction>(
            op, (Format) ((uint32_t) format | (uint32_t) Format::DPP),
            format == Format::VOP2 ? 2 : 1, clobber_vcc ? 2 : 1)};
         dpp->operands[0] = Operand(PhysReg{src0+i}, rc);
         if (format == Format::VOP2)
            dpp->operands[1] = Operand(PhysReg{src1+i}, rc);
         dpp->definitions[0] = Definition(PhysReg{dst+i}, rc);
         if (clobber_vcc)
            dpp->definitions[1] = Definition(vcc, s2);
         dpp->dpp_ctrl = dpp_ctrl;
         dpp->row_mask = row_mask;
         dpp->bank_mask = bank_mask;
         dpp->bound_ctrl = bound_ctrl_zero;
         ctx->instructions.emplace_back(std::move(dpp));
      }
   }
}

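/* Emits a single (non-DPP) VALU combine step; src0 may also be an SGPR,
 * e.g. the result of a preceding v_readlane_b32. */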
void emit_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1,
             aco_opcode op, Format format, bool clobber_vcc, unsigned size)
{
   aco_ptr<Instruction> instr;
   if (format == Format::VOP3)
      instr.reset(create_instruction<VOP3A_instruction>(op, format, 2, clobber_vcc ? 2 : 1));
   else
      instr.reset(create_instruction<VOP2_instruction>(op, format, 2, clobber_vcc ? 2 : 1));
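   /* physical registers 256..511 encode the VGPRs, lower numbers the SGPRs */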
   instr->operands[0] = Operand(src0, src0.reg >= 256 ? v1 : s1);
   instr->operands[1] = Operand(src1, v1);
   instr->definitions[0] = Definition(dst, v1);
   if (clobber_vcc)
      instr->definitions[1] = Definition(vcc, s2);
   ctx->instructions.emplace_back(std::move(instr));
}

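/* Returns the given dword of the reduction's identity element, i.e. the
 * value that leaves the result of the operation unchanged. Inactive lanes
 * are initialized to it so they cannot influence the result. */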
uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
{
   switch (op) {
   case iadd32:
   case iadd64:
   case fadd32:
   case fadd64:
   case ior32:
   case ior64:
   case ixor32:
   case ixor64:
   case umax32:
   case umax64:
      return 0;
   case imul32:
   case imul64:
      return idx ? 0 : 1;
   case fmul32:
      return 0x3f800000u; /* 1.0 */
   case fmul64:
      return idx ? 0x3ff00000u : 0u; /* 1.0 */
   case imin32:
      return INT32_MAX;
   case imin64:
      return idx ? 0x7fffffffu : 0xffffffffu;
   case imax32:
      return INT32_MIN;
   case imax64:
      return idx ? 0x80000000u : 0;
   case umin32:
   case umin64:
   case iand32:
   case iand64:
      return 0xffffffffu;
   case fmin32:
      return 0x7f800000u; /* infinity */
   case fmin64:
      return idx ? 0x7ff00000u : 0u; /* infinity */
   case fmax32:
      return 0xff800000u; /* negative infinity */
   case fmax64:
      return idx ? 0xfff00000u : 0u; /* negative infinity */
   }
   unreachable("Invalid reduction operation");
}

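/* Selects the VALU opcode for a single combine step of the reduction and
 * reports whether it clobbers VCC and which encoding it needs. The 64-bit
 * integer reductions have no direct VALU equivalent and are not expected
 * here. */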
aco_opcode get_reduction_opcode(lower_context *ctx, ReduceOp op, bool *clobber_vcc, Format *format)
{
   *clobber_vcc = false;
   *format = Format::VOP2;
   switch (op) {
   case iadd32:
      *clobber_vcc = ctx->program->chip_class < GFX9;
      return ctx->program->chip_class < GFX9 ? aco_opcode::v_add_co_u32 : aco_opcode::v_add_u32;
   case imul32:
      *format = Format::VOP3;
      return aco_opcode::v_mul_lo_u32;
   case fadd32:
      return aco_opcode::v_add_f32;
   case fmul32:
      return aco_opcode::v_mul_f32;
   case imax32:
      return aco_opcode::v_max_i32;
   case imin32:
      return aco_opcode::v_min_i32;
   case umin32:
      return aco_opcode::v_min_u32;
   case umax32:
      return aco_opcode::v_max_u32;
   case fmin32:
      return aco_opcode::v_min_f32;
   case fmax32:
      return aco_opcode::v_max_f32;
   case iand32:
      return aco_opcode::v_and_b32;
   case ixor32:
      return aco_opcode::v_xor_b32;
   case ior32:
      return aco_opcode::v_or_b32;
   case iadd64:
   case imul64:
      assert(false);
      break;
   case fadd64:
      *format = Format::VOP3;
      return aco_opcode::v_add_f64;
   case fmul64:
      *format = Format::VOP3;
      return aco_opcode::v_mul_f64;
   case imin64:
   case imax64:
   case umin64:
   case umax64:
      assert(false);
      break;
   case fmin64:
      *format = Format::VOP3;
      return aco_opcode::v_min_f64;
   case fmax64:
      *format = Format::VOP3;
      return aco_opcode::v_max_f64;
   case iand64:
   case ior64:
   case ixor64:
      assert(false);
      break;
   }
   unreachable("Invalid reduction operation");
   return aco_opcode::v_min_u32;
}

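/* Emits a single VOP2/VOP3 combine step on full-size register operands. */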
void emit_vopn(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1,
               RegClass rc, aco_opcode op, Format format, bool clobber_vcc)
{
   aco_ptr<Instruction> instr;
   switch (format) {
   case Format::VOP2:
      instr.reset(create_instruction<VOP2_instruction>(op, format, 2, clobber_vcc ? 2 : 1));
      break;
   case Format::VOP3:
      instr.reset(create_instruction<VOP3A_instruction>(op, format, 2, clobber_vcc ? 2 : 1));
      break;
   default:
      assert(false);
   }
   instr->operands[0] = Operand(src0, rc);
   instr->operands[1] = Operand(src1, rc);
   instr->definitions[0] = Definition(dst, rc);
   if (clobber_vcc)
      instr->definitions[1] = Definition(vcc, s2);
   ctx->instructions.emplace_back(std::move(instr));
}

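/* Lowers p_reduce, p_inclusive_scan and p_exclusive_scan: exec is saved and
 * set to all-ones, inactive lanes are filled with the identity, and the
 * lanes are then combined with a sequence of DPP swizzles. GFX10 lacks the
 * row_bcast and wavefront-shift DPP modes, so it uses v_permlanex16_b32 and
 * v_readlane_b32/v_writelane_b32 instead. */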
void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, PhysReg tmp,
                    PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
{
   assert(cluster_size == 64 || op == aco_opcode::p_reduce);

   Builder bld(ctx->program, &ctx->instructions);

   PhysReg wrtmp{0}; /* should never be needed */

   Format format;
   bool should_clobber_vcc;
   aco_opcode reduce_opcode = get_reduction_opcode(ctx, reduce_op, &should_clobber_vcc, &format);
   Operand identity[2];
   identity[0] = Operand(get_reduction_identity(reduce_op, 0));
   identity[1] = Operand(get_reduction_identity(reduce_op, 1));
   Operand vcndmask_identity[2] = {identity[0], identity[1]};

   /* First, copy the source to tmp and set inactive lanes to the identity */
   bld.sop1(aco_opcode::s_or_saveexec_b64, Definition(stmp, s2), Definition(scc, s1), Definition(exec, s2), Operand(UINT64_MAX), Operand(exec, s2));

   for (unsigned i = 0; i < src.size(); i++) {
      /* p_exclusive_scan needs the identity to be an SGPR or inline constant
       * for the v_writelane_b32, except on GFX10, where v_writelane_b32 can
       * take a literal. */
      if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && ctx->program->chip_class < GFX10) {
         bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]);
         identity[i] = Operand(PhysReg{sitmp+i}, s1);

         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]);
         vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1);
      } else if (identity[i].isLiteral()) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]);
         vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1);
      }
   }

   for (unsigned i = 0; i < src.size(); i++) {
      bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
                   vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
                   Operand(stmp, s2));
   }

   bool exec_restored = false;
   bool dst_written = false;
   switch (op) {
   case aco_opcode::p_reduce:
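      /* reduce within clusters of growing size: the quad_perms combine
       * neighbouring lanes and pairs, half-mirror and mirror then combine
       * clusters of 8 and 16 lanes */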
      if (cluster_size == 1) break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_quad_perm(1, 0, 3, 2), 0xf, 0xf, false, src.size());
      if (cluster_size == 2) break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_quad_perm(2, 3, 0, 1), 0xf, 0xf, false, src.size());
      if (cluster_size == 4) break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_half_mirror, 0xf, 0xf, false, src.size());
      if (cluster_size == 8) break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_mirror, 0xf, 0xf, false, src.size());
      if (cluster_size == 16) break;
      if (cluster_size == 32) {
         for (unsigned i = 0; i < src.size(); i++)
            bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), ds_pattern_bitmode(0x1f, 0, 0x10));
         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2));
         exec_restored = true;
         emit_vopn(ctx, dst.physReg(), vtmp, tmp, src.regClass(), reduce_opcode, format, should_clobber_vcc);
         dst_written = true;
      } else if (ctx->program->chip_class >= GFX10) {
         assert(cluster_size == 64);
         /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
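         /* instead, v_permlanex16_b32 swaps the two 16-lane halves of each
          * 32-lane group to finish the 32-wide reduction, and a readlane of
          * lane 31 combines the two wave halves so that lane 63 ends up with
          * the full result */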
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));
         emit_op(ctx, tmp, tmp, vtmp, reduce_opcode, format, should_clobber_vcc, src.size());

         for (unsigned i = 0; i < src.size(); i++)
            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
         emit_op(ctx, tmp, sitmp, tmp, reduce_opcode, format, should_clobber_vcc, src.size());
      } else {
         assert(cluster_size == 64);
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                     dpp_row_bcast15, 0xa, 0xf, false, src.size());
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                     dpp_row_bcast31, 0xc, 0xf, false, src.size());
      }
      break;
   case aco_opcode::p_exclusive_scan:
      if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
         /* shift rows right */
         for (unsigned i = 0; i < src.size(); i++) {
            bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), dpp_row_sr(1), 0xf, 0xf, true);
         }
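         /* dpp_row_sr only shifts within each 16-lane row, so lanes 16, 32
          * and 48 are missing the last value of the preceding row; those
          * gaps are filled in below (lane 0 gets the identity later) */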

         /* fill in the gaps in rows 1 and 3 */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10000u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0x10000u));
         for (unsigned i = 0; i < src.size(); i++) {
            Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
                                         Definition(PhysReg{vtmp+i}, v1),
                                         Operand(PhysReg{tmp+i}, v1),
                                         Operand(0xffffffffu), Operand(0xffffffffu)).instr;
            static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
         }
         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));

         /* fill in the gap in row 2 */
         for (unsigned i = 0; i < src.size(); i++) {
            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
            bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
         }
         std::swap(tmp, vtmp);
      } else {
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
                     dpp_wf_sr1, 0xf, 0xf, true, src.size());
      }
      for (unsigned i = 0; i < src.size(); i++) {
         if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
            if (ctx->program->chip_class < GFX10)
               assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
            bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
                     identity[i], Operand(0u));
         }
      }
      /* fall through */
   case aco_opcode::p_inclusive_scan:
      assert(cluster_size == 64);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_sr(1), 0xf, 0xf, false, src.size(), identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_sr(2), 0xf, 0xf, false, src.size(), identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_sr(4), 0xf, 0xf, false, src.size(), identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                  dpp_row_sr(8), 0xf, 0xf, false, src.size(), identity);
      if (ctx->program->chip_class >= GFX10) {
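         /* GFX10 lacks row_bcast15/31: use v_permlanex16_b32 to add each
          * even row's total into the odd row above it, then add the total of
          * the low 32 lanes into the high half via a readlane of lane 31 */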
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xffff0000u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffff0000u));
         for (unsigned i = 0; i < src.size(); i++) {
            Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
                                         Definition(PhysReg{vtmp+i}, v1),
                                         Operand(PhysReg{tmp+i}, v1),
                                         Operand(0xffffffffu), Operand(0xffffffffu)).instr;
            static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
         }
         emit_op(ctx, tmp, tmp, vtmp, reduce_opcode, format, should_clobber_vcc, src.size());

         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
         emit_op(ctx, tmp, sitmp, tmp, reduce_opcode, format, should_clobber_vcc, src.size());
      } else {
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                     dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                     dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
      }
      break;
   default:
      unreachable("Invalid reduction mode");
   }

   if (!exec_restored)
      bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2));

   if (op == aco_opcode::p_reduce && cluster_size == 64) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{dst.physReg() + k}, s1),
                  Operand(PhysReg{tmp + k}, v1), Operand(63u));
      }
   } else if (!(dst.physReg() == tmp) && !dst_written) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
                  Operand(PhysReg{tmp + k}, v1));
      }
   }
}

struct copy_operation {
   Operand op;
   Definition def;
   unsigned uses;
   unsigned size;
};

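/* Lowers a set of parallel copies: copies whose destination is not read by
 * any other copy (paths in the location transfer graph) are emitted first,
 * the remaining copies form cycles which are broken up with swaps, and
 * constants are materialized last. */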
void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi)
{
   Builder bld(ctx->program, &ctx->instructions);
   aco_ptr<Instruction> mov;
   std::map<PhysReg, copy_operation>::iterator it = copy_map.begin();
   std::map<PhysReg, copy_operation>::iterator target;
   bool writes_scc = false;

   /* count the number of uses for each dst reg */
   while (it != copy_map.end()) {
      if (it->second.op.isConstant()) {
         ++it;
         continue;
      }

      if (it->second.def.physReg() == scc)
         writes_scc = true;

      assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));

      /* if src and dst reg are the same, remove operation */
      if (it->first == it->second.op.physReg()) {
         it = copy_map.erase(it);
         continue;
      }
      /* check if the operand reg may be overwritten by another copy operation */
      target = copy_map.find(it->second.op.physReg());
      if (target != copy_map.end()) {
         target->second.uses++;
      }

      ++it;
   }

   /* first, handle paths in the location transfer graph */
   bool preserve_scc = pi->tmp_in_scc && !writes_scc;
   it = copy_map.begin();
   while (it != copy_map.end()) {

      /* the target reg is not used as operand for any other copy */
      if (it->second.uses == 0) {

         /* try to coalesce 32-bit sgpr copies to 64-bit copies */
         if (it->second.def.getTemp().type() == RegType::sgpr && it->second.size == 1 &&
             !it->second.op.isConstant() && it->first % 2 == it->second.op.physReg() % 2) {

            PhysReg other_def_reg = PhysReg{it->first % 2 ? it->first - 1 : it->first + 1};
            PhysReg other_op_reg = PhysReg{it->first % 2 ? it->second.op.physReg() - 1 : it->second.op.physReg() + 1};
            std::map<PhysReg, copy_operation>::iterator other = copy_map.find(other_def_reg);

            if (other != copy_map.end() && !other->second.uses && other->second.size == 1 &&
                other->second.op.physReg() == other_op_reg && !other->second.op.isConstant()) {
               std::map<PhysReg, copy_operation>::iterator to_erase = it->first % 2 ? it : other;
               it = it->first % 2 ? other : it;
               copy_map.erase(to_erase);
               it->second.size = 2;
            }
         }

         if (it->second.def.physReg() == scc) {
            bld.sopc(aco_opcode::s_cmp_lg_i32, it->second.def, it->second.op, Operand(0u));
            preserve_scc = true;
         } else if (it->second.size == 2 && it->second.def.getTemp().type() == RegType::sgpr) {
            bld.sop1(aco_opcode::s_mov_b64, it->second.def, Operand(it->second.op.physReg(), s2));
         } else {
            bld.copy(it->second.def, it->second.op);
         }

         /* reduce the number of uses of the operand reg by one */
         if (!it->second.op.isConstant()) {
            for (unsigned i = 0; i < it->second.size; i++) {
               target = copy_map.find(PhysReg{it->second.op.physReg() + i});
               if (target != copy_map.end())
                  target->second.uses--;
            }
         }

         copy_map.erase(it);
         it = copy_map.begin();
         continue;
      } else {
         /* the target reg is used as operand, check the next entry */
         ++it;
      }
   }

   if (copy_map.empty())
      return;

   /* all target regs are needed as operand somewhere, which means all entries are part of a cycle */
   bool constants = false;
   for (it = copy_map.begin(); it != copy_map.end(); ++it) {
      assert(it->second.op.isFixed());
      if (it->first == it->second.op.physReg())
         continue;
      /* do constants later */
      if (it->second.op.isConstant()) {
         constants = true;
         continue;
      }

      if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
         assert(!(it->second.def.physReg() == pi->scratch_sgpr));

      /* to resolve the cycle, we have to swap the src reg with the dst reg */
      copy_operation swap = it->second;
      assert(swap.op.regClass() == swap.def.regClass());
      Operand def_as_op = Operand(swap.def.physReg(), swap.def.regClass());
      Definition op_as_def = Definition(swap.op.physReg(), swap.op.regClass());
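      /* GFX9+ can swap VGPRs directly with v_swap_b32; otherwise fall back
       * to an xor-swap (three xors exchange the registers in place) or, when
       * SCC has to survive, a move through the scratch SGPR */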
      if (chip_class >= GFX9 && swap.def.getTemp().type() == RegType::vgpr) {
         bld.vop1(aco_opcode::v_swap_b32, swap.def, op_as_def, swap.op, def_as_op);
      } else if (swap.op.physReg() == scc || swap.def.physReg() == scc) {
         /* we need to swap scc and another sgpr */
         assert(!preserve_scc);

         PhysReg other = swap.op.physReg() == scc ? swap.def.physReg() : swap.op.physReg();

         bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
         bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1), Operand(0u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
      } else if (swap.def.getTemp().type() == RegType::sgpr) {
         if (preserve_scc) {
            bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), swap.op);
            bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
            bld.sop1(aco_opcode::s_mov_b32, swap.def, Operand(pi->scratch_sgpr, s1));
         } else {
            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), swap.op, def_as_op);
            bld.sop2(aco_opcode::s_xor_b32, swap.def, Definition(scc, s1), swap.op, def_as_op);
            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), swap.op, def_as_op);
         }
      } else {
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, swap.def, swap.op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op);
      }

      /* change the operand reg of the target's use */
      assert(swap.uses == 1);
      target = it;
      for (++target; target != copy_map.end(); ++target) {
         if (target->second.op.physReg() == it->first) {
            target->second.op.setFixed(swap.op.physReg());
            break;
         }
      }
   }

   /* copy constants into the registers which were operands */
   if (constants) {
      for (it = copy_map.begin(); it != copy_map.end(); ++it) {
         if (!it->second.op.isConstant())
            continue;
         if (it->second.def.physReg() == scc) {
            bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(0u), Operand(it->second.op.constantValue() ? 1u : 0u));
         } else {
            bld.copy(it->second.def, it->second.op);
         }
      }
   }
}

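/* Replaces the remaining pseudo instructions (vector splits and
 * concatenations, parallel copies, spills, branches and reductions) with
 * real hardware instructions. */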
void lower_to_hw_instr(Program* program)
{
   Block *discard_block = NULL;

   for (size_t i = 0; i < program->blocks.size(); i++)
   {
      Block *block = &program->blocks[i];
      lower_context ctx;
      ctx.program = program;
      Builder bld(program, &ctx.instructions);

      for (size_t j = 0; j < block->instructions.size(); j++) {
         aco_ptr<Instruction>& instr = block->instructions[j];
         aco_ptr<Instruction> mov;
         if (instr->format == Format::PSEUDO) {
            Pseudo_instruction *pi = (Pseudo_instruction*)instr.get();

            switch (instr->opcode)
            {
            case aco_opcode::p_extract_vector:
            {
               unsigned reg = instr->operands[0].physReg() + instr->operands[1].constantValue() * instr->definitions[0].size();
               RegClass rc = RegClass(instr->operands[0].getTemp().type(), 1);
               RegClass rc_def = RegClass(instr->definitions[0].getTemp().type(), 1);
               if (reg == instr->definitions[0].physReg())
                  break;

               std::map<PhysReg, copy_operation> copy_operations;
               for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
                  Definition def = Definition(PhysReg{instr->definitions[0].physReg() + i}, rc_def);
                  copy_operations[def.physReg()] = {Operand(PhysReg{reg + i}, rc), def, 0, 1};
               }
               handle_operands(copy_operations, &ctx, program->chip_class, pi);
               break;
            }
            case aco_opcode::p_create_vector:
            {
               std::map<PhysReg, copy_operation> copy_operations;
               RegClass rc_def = RegClass(instr->definitions[0].getTemp().type(), 1);
               unsigned reg_idx = 0;
               for (const Operand& op : instr->operands) {
                  if (op.isConstant()) {
                     const PhysReg reg = PhysReg{instr->definitions[0].physReg() + reg_idx};
                     const Definition def = Definition(reg, rc_def);
                     copy_operations[reg] = {op, def, 0, 1};
                     reg_idx++;
                     continue;
                  }

                  RegClass rc_op = RegClass(op.getTemp().type(), 1);
                  for (unsigned j = 0; j < op.size(); j++)
                  {
                     const Operand copy_op = Operand(PhysReg{op.physReg() + j}, rc_op);
                     const Definition def = Definition(PhysReg{instr->definitions[0].physReg() + reg_idx}, rc_def);
                     copy_operations[def.physReg()] = {copy_op, def, 0, 1};
                     reg_idx++;
                  }
               }
               handle_operands(copy_operations, &ctx, program->chip_class, pi);
               break;
            }
            case aco_opcode::p_split_vector:
            {
               std::map<PhysReg, copy_operation> copy_operations;
               RegClass rc_op = instr->operands[0].isConstant() ? s1 : RegClass(instr->operands[0].regClass().type(), 1);
               for (unsigned i = 0; i < instr->definitions.size(); i++) {
                  unsigned k = instr->definitions[i].size();
                  RegClass rc_def = RegClass(instr->definitions[i].getTemp().type(), 1);
                  for (unsigned j = 0; j < k; j++) {
                     Operand op = Operand(PhysReg{instr->operands[0].physReg() + (i*k+j)}, rc_op);
                     Definition def = Definition(PhysReg{instr->definitions[i].physReg() + j}, rc_def);
                     copy_operations[def.physReg()] = {op, def, 0, 1};
                  }
               }
               handle_operands(copy_operations, &ctx, program->chip_class, pi);
               break;
            }
            case aco_opcode::p_parallelcopy:
            case aco_opcode::p_wqm:
            {
               std::map<PhysReg, copy_operation> copy_operations;
               for (unsigned i = 0; i < instr->operands.size(); i++)
               {
                  Operand operand = instr->operands[i];
                  if (operand.isConstant() || operand.size() == 1) {
                     assert(instr->definitions[i].size() == 1);
                     copy_operations[instr->definitions[i].physReg()] = {operand, instr->definitions[i], 0, 1};
                  } else {
                     RegClass def_rc = RegClass(instr->definitions[i].regClass().type(), 1);
                     RegClass op_rc = RegClass(operand.getTemp().type(), 1);
                     for (unsigned j = 0; j < operand.size(); j++)
                     {
                        Operand op = Operand(PhysReg{instr->operands[i].physReg() + j}, op_rc);
                        Definition def = Definition(PhysReg{instr->definitions[i].physReg() + j}, def_rc);
                        copy_operations[def.physReg()] = {op, def, 0, 1};
                     }
                  }
               }
               handle_operands(copy_operations, &ctx, program->chip_class, pi);
               break;
            }
            case aco_opcode::p_exit_early_if:
            {
               /* don't bother with an early exit at the end of the program */
               if (block->instructions[j + 1]->opcode == aco_opcode::p_logical_end &&
                   block->instructions[j + 2]->opcode == aco_opcode::s_endpgm) {
                  break;
               }

               if (!discard_block) {
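                  /* all early exits branch to a single shared block which
                   * does a null export (done, valid mask) and ends the
                   * program */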
                  discard_block = program->create_and_insert_block();
                  block = &program->blocks[i];

                  bld.reset(discard_block);
                  bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
                          0, V_008DFC_SQ_EXP_NULL, false, true, true);
                  if (program->wb_smem_l1_on_end)
                     bld.smem(aco_opcode::s_dcache_wb);
                  bld.sopp(aco_opcode::s_endpgm);

                  bld.reset(&ctx.instructions);
               }

               //TODO: exec can be zero here with block_kind_discard

               assert(instr->operands[0].physReg() == scc);
               bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index);

               discard_block->linear_preds.push_back(block->index);
               block->linear_succs.push_back(discard_block->index);
               break;
            }
            case aco_opcode::p_spill:
            {
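               /* SGPR spill: each dword is written to a lane of the linear
                * VGPR at the constant lane offset; p_reload below reads it
                * back with v_readlane_b32 */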
               assert(instr->operands[0].regClass() == v1.as_linear());
               for (unsigned i = 0; i < instr->operands[2].size(); i++) {
                  bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1, instr->operands[0].physReg()),
                           Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
                           Operand(instr->operands[1].constantValue() + i));
               }
               break;
            }
            case aco_opcode::p_reload:
            {
               assert(instr->operands[0].regClass() == v1.as_linear());
               for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
                  bld.vop3(aco_opcode::v_readlane_b32,
                           bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
                           instr->operands[0], Operand(instr->operands[1].constantValue() + i));
               }
               break;
            }
            case aco_opcode::p_as_uniform:
            {
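               /* constants and SGPR sources are handled as plain parallel
                * copies; a VGPR source has to go through
                * v_readfirstlane_b32 per dword */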
               if (instr->operands[0].isConstant() || instr->operands[0].regClass().type() == RegType::sgpr) {
                  std::map<PhysReg, copy_operation> copy_operations;
                  Operand operand = instr->operands[0];
                  if (operand.isConstant() || operand.size() == 1) {
                     assert(instr->definitions[0].size() == 1);
                     copy_operations[instr->definitions[0].physReg()] = {operand, instr->definitions[0], 0, 1};
                  } else {
                     for (unsigned i = 0; i < operand.size(); i++)
                     {
                        Operand op = Operand(PhysReg{operand.physReg() + i}, s1);
                        Definition def = Definition(PhysReg{instr->definitions[0].physReg() + i}, s1);
                        copy_operations[def.physReg()] = {op, def, 0, 1};
                     }
                  }

                  handle_operands(copy_operations, &ctx, program->chip_class, pi);
               } else {
                  assert(instr->operands[0].regClass().type() == RegType::vgpr);
                  assert(instr->definitions[0].regClass().type() == RegType::sgpr);
                  assert(instr->operands[0].size() == instr->definitions[0].size());
                  for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
                     bld.vop1(aco_opcode::v_readfirstlane_b32,
                              bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
                              Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
                  }
               }
               break;
            }
            default:
               break;
            }
         } else if (instr->format == Format::PSEUDO_BRANCH) {
            Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(instr.get());
            /* check if all blocks from current to target are empty */
            bool can_remove = block->index < branch->target[0];
            for (unsigned i = block->index + 1; can_remove && i < branch->target[0]; i++) {
               if (program->blocks[i].instructions.size())
                  can_remove = false;
            }
            if (can_remove)
               continue;

            switch (instr->opcode) {
            case aco_opcode::p_branch:
               assert(block->linear_succs[0] == branch->target[0]);
               bld.sopp(aco_opcode::s_branch, branch->target[0]);
               break;
            case aco_opcode::p_cbranch_nz:
               assert(block->linear_succs[1] == branch->target[0]);
               if (branch->operands[0].physReg() == exec)
                  bld.sopp(aco_opcode::s_cbranch_execnz, branch->target[0]);
               else if (branch->operands[0].physReg() == vcc)
                  bld.sopp(aco_opcode::s_cbranch_vccnz, branch->target[0]);
               else {
                  assert(branch->operands[0].physReg() == scc);
                  bld.sopp(aco_opcode::s_cbranch_scc1, branch->target[0]);
               }
               break;
            case aco_opcode::p_cbranch_z:
               assert(block->linear_succs[1] == branch->target[0]);
               if (branch->operands[0].physReg() == exec)
                  bld.sopp(aco_opcode::s_cbranch_execz, branch->target[0]);
               else if (branch->operands[0].physReg() == vcc)
                  bld.sopp(aco_opcode::s_cbranch_vccz, branch->target[0]);
               else {
                  assert(branch->operands[0].physReg() == scc);
                  bld.sopp(aco_opcode::s_cbranch_scc0, branch->target[0]);
               }
               break;
            default:
               unreachable("Unknown Pseudo branch instruction!");
            }

         } else if (instr->format == Format::PSEUDO_REDUCTION) {
            Pseudo_reduction_instruction* reduce = static_cast<Pseudo_reduction_instruction*>(instr.get());
            emit_reduction(&ctx, reduce->opcode, reduce->reduce_op, reduce->cluster_size,
                           reduce->operands[1].physReg(), // tmp
                           reduce->definitions[1].physReg(), // stmp
                           reduce->operands[2].physReg(), // vtmp
                           reduce->definitions[2].physReg(), // sitmp
                           reduce->operands[0], reduce->definitions[0]);
         } else {
            ctx.instructions.emplace_back(std::move(instr));
         }

      }
      block->instructions.swap(ctx.instructions);
   }
}

}