aco: implement 8-bit/16-bit reductions
[mesa.git] / src / amd / compiler / aco_lower_to_hw_instr.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 *
26 */
27
28 #include <map>
29
30 #include "aco_ir.h"
31 #include "aco_builder.h"
32 #include "util/u_math.h"
33 #include "sid.h"
34 #include "vulkan/radv_shader.h"
35
36
37 namespace aco {
38
39 struct lower_context {
40 Program *program;
41 std::vector<aco_ptr<Instruction>> instructions;
42 };
43
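/* Maps a ReduceOp to the VALU opcode that performs one combining step of the
 * reduction. 8-bit and 16-bit ops reuse the corresponding 16-bit instructions;
 * a return value of aco_opcode::num_opcodes means there is no single opcode
 * and the op is handled by emit_int64_op()/emit_int64_dpp_op() instead. */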
44 aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) {
45 switch (op) {
46 case iadd8:
47 case iadd16: return aco_opcode::v_add_u16;
48 case imul8:
49 case imul16: return aco_opcode::v_mul_lo_u16;
50 case fadd16: return aco_opcode::v_add_f16;
51 case fmul16: return aco_opcode::v_mul_f16;
52 case imax8:
53 case imax16: return aco_opcode::v_max_i16;
54 case imin8:
55 case imin16: return aco_opcode::v_min_i16;
56 case umin8:
57 case umin16: return aco_opcode::v_min_u16;
58 case umax8:
59 case umax16: return aco_opcode::v_max_u16;
60 case fmin16: return aco_opcode::v_min_f16;
61 case fmax16: return aco_opcode::v_max_f16;
62 case iadd32: return chip >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
63 case imul32: return aco_opcode::v_mul_lo_u32;
64 case fadd32: return aco_opcode::v_add_f32;
65 case fmul32: return aco_opcode::v_mul_f32;
66 case imax32: return aco_opcode::v_max_i32;
67 case imin32: return aco_opcode::v_min_i32;
68 case umin32: return aco_opcode::v_min_u32;
69 case umax32: return aco_opcode::v_max_u32;
70 case fmin32: return aco_opcode::v_min_f32;
71 case fmax32: return aco_opcode::v_max_f32;
72 case iand8:
73 case iand16:
74 case iand32: return aco_opcode::v_and_b32;
75 case ixor8:
76 case ixor16:
77 case ixor32: return aco_opcode::v_xor_b32;
78 case ior8:
79 case ior16:
80 case ior32: return aco_opcode::v_or_b32;
81 case iadd64: return aco_opcode::num_opcodes;
82 case imul64: return aco_opcode::num_opcodes;
83 case fadd64: return aco_opcode::v_add_f64;
84 case fmul64: return aco_opcode::v_mul_f64;
85 case imin64: return aco_opcode::num_opcodes;
86 case imax64: return aco_opcode::num_opcodes;
87 case umin64: return aco_opcode::num_opcodes;
88 case umax64: return aco_opcode::num_opcodes;
89 case fmin64: return aco_opcode::v_min_f64;
90 case fmax64: return aco_opcode::v_max_f64;
91 case iand64: return aco_opcode::num_opcodes;
92 case ior64: return aco_opcode::num_opcodes;
93 case ixor64: return aco_opcode::num_opcodes;
94 default: return aco_opcode::num_opcodes;
95 }
96 }
97
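/* Emits a 32-bit add through Builder::vadd32 and, if the builder had to pick a
 * carry-producing variant, pins the carry definition to VCC, which is the
 * register the VOP2 form of v_add_co_u32 implicitly writes. */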
98 void emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
99 {
100 Instruction *instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
101 if (instr->definitions.size() >= 2) {
102 assert(instr->definitions[1].regClass() == bld.lm);
103 instr->definitions[1].setFixed(vcc);
104 }
105 }
106
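/* Performs one DPP reduction step for a 64-bit integer op that has no native
 * 64-bit VALU instruction. The operation is split into its 32-bit halves:
 * adds use a carry chain, bitwise ops work per half, min/max compare the full
 * 64-bit value and select per half, and imul64 is built from three 32-bit
 * multiplies. When an identity is provided, vtmp is pre-filled with it so that
 * lanes the DPP mov does not write still contribute the identity. */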
107 void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
108 PhysReg vtmp_reg, ReduceOp op,
109 unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl,
110 Operand *identity=NULL)
111 {
112 Builder bld(ctx->program, &ctx->instructions);
113 Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)};
114 Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg+1}, v1)};
115 Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg+1}, v1)};
116 Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)};
117 Operand src1_64 = Operand(src1_reg, v2);
118 Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg+1}, v1)};
119 Operand vtmp_op64 = Operand(vtmp_reg, v2);
120 if (op == iadd64) {
121 if (ctx->program->chip_class >= GFX10) {
122 if (identity)
123 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
124 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
125 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
126 bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
127 } else {
128 bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
129 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
130 }
131 bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm),
132 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
133 } else if (op == iand64) {
134 bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0],
135 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
136 bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1],
137 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
138 } else if (op == ior64) {
139 bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0],
140 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
141 bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1],
142 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
143 } else if (op == ixor64) {
144 bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0],
145 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
146 bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1],
147 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
148 } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
149 aco_opcode cmp = aco_opcode::num_opcodes;
150 switch (op) {
151 case umin64:
152 cmp = aco_opcode::v_cmp_gt_u64;
153 break;
154 case umax64:
155 cmp = aco_opcode::v_cmp_lt_u64;
156 break;
157 case imin64:
158 cmp = aco_opcode::v_cmp_gt_i64;
159 break;
160 case imax64:
161 cmp = aco_opcode::v_cmp_lt_i64;
162 break;
163 default:
164 break;
165 }
166
167 if (identity) {
168 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
169 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
170 }
171 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
172 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
173 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1],
174 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
175
176 bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
177 bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
178 bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
179 } else if (op == imul64) {
180 /* t4 = dpp(x_hi)
181 * t1 = umul_lo(t4, y_lo)
182 * t3 = dpp(x_lo)
183 * t0 = umul_lo(t3, y_hi)
184 * t2 = iadd(t0, t1)
185 * t5 = umul_hi(t3, y_lo)
186 * res_hi = iadd(t2, t5)
187 * res_lo = umul_lo(t3, y_lo)
188 * Requires that res_hi != src0[0] and res_hi != src1[0]
189 * and that vtmp[0] != res_hi.
190 */
191 if (identity)
192 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
193 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1],
194 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
195 bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
196 if (identity)
197 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
198 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
199 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
200 bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
201 emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
202 if (identity)
203 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
204 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
205 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
206 bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
207 emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
208 if (identity)
209 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
210 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
211 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
212 bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
213 }
214 }
215
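/* Non-DPP variant of the 64-bit lowering above, used for the cluster steps
 * done with ds_swizzle on GFX6/7 and for the final combining steps. src0 may
 * be an SGPR pair (e.g. the result of a readlane); ops whose instructions
 * cannot take those SGPR operands directly copy them to vtmp first. */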
216 void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp, ReduceOp op)
217 {
218 Builder bld(ctx->program, &ctx->instructions);
219 Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)};
220 RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
221 Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg+1}, src0_rc)};
222 Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)};
223 Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
224 Operand src1_64 = Operand(src1_reg, v2);
225
226 if (src0_rc == s1 &&
227 (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
228 assert(vtmp.reg() != 0);
229 bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
230 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]);
231 src0_reg = vtmp;
232 src0[0] = Operand(vtmp, v1);
233 src0[1] = Operand(PhysReg{vtmp+1}, v1);
234 src0_64 = Operand(vtmp, v2);
235 } else if (src0_rc == s1 && op == iadd64) {
236 assert(vtmp.reg() != 0);
237 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]);
238 src0[1] = Operand(PhysReg{vtmp+1}, v1);
239 }
240
241 if (op == iadd64) {
242 if (ctx->program->chip_class >= GFX10) {
243 bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
244 } else {
245 bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
246 }
247 bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm));
248 } else if (op == iand64) {
249 bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
250 bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
251 } else if (op == ior64) {
252 bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
253 bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
254 } else if (op == ixor64) {
255 bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
256 bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
257 } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
258 aco_opcode cmp = aco_opcode::num_opcodes;
259 switch (op) {
260 case umin64:
261 cmp = aco_opcode::v_cmp_gt_u64;
262 break;
263 case umax64:
264 cmp = aco_opcode::v_cmp_lt_u64;
265 break;
266 case imin64:
267 cmp = aco_opcode::v_cmp_gt_i64;
268 break;
269 case imax64:
270 cmp = aco_opcode::v_cmp_lt_i64;
271 break;
272 default:
273 break;
274 }
275
276 bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
277 bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
278 bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
279 } else if (op == imul64) {
280 if (src1_reg == dst_reg) {
281 /* it's fine if src0==dst but not if src1==dst */
282 std::swap(src0_reg, src1_reg);
283 std::swap(src0[0], src1[0]);
284 std::swap(src0[1], src1[1]);
285 std::swap(src0_64, src1_64);
286 }
287 assert(!(src0_reg == src1_reg));
288 /* t1 = umul_lo(x_hi, y_lo)
289 * t0 = umul_lo(x_lo, y_hi)
290 * t2 = iadd(t0, t1)
291 * t5 = umul_hi(x_lo, y_lo)
292 * res_hi = iadd(t2, t5)
293 * res_lo = umul_lo(x_lo, y_lo)
294 * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
295 */
296 Definition tmp0_def(PhysReg{src0_reg+1}, v1);
297 Definition tmp1_def(PhysReg{src1_reg+1}, v1);
298 Operand tmp0_op = src0[1];
299 Operand tmp1_op = src1[1];
300 bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
301 bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
302 emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
303 bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
304 emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
305 bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
306 }
307 }
308
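/* Emits one reduction step where the first source goes through a DPP lane
 * shuffle. Plain 32-bit ops encode the DPP on the VOP2 itself; VOP3-only ops
 * (imul32) and two-dword ops first copy the shuffled value to vtmp with
 * v_mov_b32_dpp (optionally pre-initialised with the identity for lanes the
 * DPP mov leaves unwritten) and then apply the operation without DPP. */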
309 void emit_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
310 PhysReg vtmp, ReduceOp op, unsigned size,
311 unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl,
312 Operand *identity=NULL) /* for VOP3 with sparse writes */
313 {
314 Builder bld(ctx->program, &ctx->instructions);
315 RegClass rc = RegClass(RegType::vgpr, size);
316 Definition dst(dst_reg, rc);
317 Operand src0(src0_reg, rc);
318 Operand src1(src1_reg, rc);
319
320 aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);
321 bool vop3 = op == imul32 || size == 2;
322
323 if (!vop3) {
324 if (opcode == aco_opcode::v_add_co_u32)
325 bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
326 else
327 bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
328 return;
329 }
330
331 if (opcode == aco_opcode::num_opcodes) {
332       emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op,

333 dpp_ctrl, row_mask, bank_mask, bound_ctrl, identity);
334 return;
335 }
336
337 if (identity)
338 bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
339 if (identity && size >= 2)
340 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), identity[1]);
341
342 for (unsigned i = 0; i < size; i++)
343 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{src0_reg+i}, v1),
344 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
345
346 bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
347 }
348
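/* Same as emit_dpp_op but without a lane shuffle: combines src0 (VGPR or SGPR)
 * with src1 using the plain opcode, falling back to the 64-bit integer
 * lowering when no single opcode exists. */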
349 void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
350 PhysReg vtmp, ReduceOp op, unsigned size)
351 {
352 Builder bld(ctx->program, &ctx->instructions);
353 RegClass rc = RegClass(RegType::vgpr, size);
354 Definition dst(dst_reg, rc);
355 Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
356 Operand src1(src1_reg, rc);
357
358 aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);
359 bool vop3 = op == imul32 || size == 2;
360
361 if (opcode == aco_opcode::num_opcodes) {
362 emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
363 return;
364 }
365
366 if (vop3) {
367 bld.vop3(opcode, dst, src0, src1);
368 } else if (opcode == aco_opcode::v_add_co_u32) {
369 bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
370 } else {
371 bld.vop2(opcode, dst, src0, src1);
372 }
373 }
374
375 void emit_dpp_mov(lower_context *ctx, PhysReg dst, PhysReg src0, unsigned size,
376 unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
377 {
378 Builder bld(ctx->program, &ctx->instructions);
379 for (unsigned i = 0; i < size; i++) {
380 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst+i}, v1), Operand(PhysReg{src0+i}, v1),
381 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
382 }
383 }
384
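/* Returns the identity value of a reduction op, i.e. the value inactive lanes
 * must contribute so they don't affect the result. idx selects the low (0) or
 * high (1) dword for 64-bit ops. For example, fmul16 uses 0x3c00
 * (half-precision 1.0), umax8 uses 0, and imin8/imin16 use the largest
 * representable signed value so that any real input compares smaller. */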
385 uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
386 {
387 switch (op) {
388 case iadd8:
389 case iadd16:
390 case iadd32:
391 case iadd64:
392 case fadd16:
393 case fadd32:
394 case fadd64:
395 case ior8:
396 case ior16:
397 case ior32:
398 case ior64:
399 case ixor8:
400 case ixor16:
401 case ixor32:
402 case ixor64:
403 case umax8:
404 case umax16:
405 case umax32:
406 case umax64:
407 return 0;
408 case imul8:
409 case imul16:
410 case imul32:
411 case imul64:
412 return idx ? 0 : 1;
413 case fmul16:
414 return 0x3c00u; /* 1.0 */
415 case fmul32:
416 return 0x3f800000u; /* 1.0 */
417 case fmul64:
418 return idx ? 0x3ff00000u : 0u; /* 1.0 */
419 case imin8:
420 return INT8_MAX;
421 case imin16:
422 return INT16_MAX;
423 case imin32:
424 return INT32_MAX;
425 case imin64:
426 return idx ? 0x7fffffffu : 0xffffffffu;
427 case imax8:
428 return INT8_MIN;
429 case imax16:
430 return INT16_MIN;
431 case imax32:
432 return INT32_MIN;
433 case imax64:
434 return idx ? 0x80000000u : 0;
435 case umin8:
436 case umin16:
437 case iand8:
438 case iand16:
439 return 0xffffffffu;
440 case umin32:
441 case umin64:
442 case iand32:
443 case iand64:
444 return 0xffffffffu;
445 case fmin16:
446 return 0x7c00u; /* infinity */
447 case fmin32:
448 return 0x7f800000u; /* infinity */
449 case fmin64:
450 return idx ? 0x7ff00000u : 0u; /* infinity */
451 case fmax16:
452 return 0xfc00u; /* negative infinity */
453 case fmax32:
454 return 0xff800000u; /* negative infinity */
455 case fmax64:
456 return idx ? 0xfff00000u : 0u; /* negative infinity */
457 default:
458 unreachable("Invalid reduction operation");
459 break;
460 }
461 return 0;
462 }
463
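/* Emits ds_swizzle_b32 per dword; used on GFX6/7 (and for a few later
 * lane-shuffle patterns) as a substitute for DPP where DPP is unavailable or
 * insufficient. */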
464 void emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
465 {
466 for (unsigned i = 0; i < size; i++) {
467 bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst+i}, v1),
468 Operand(PhysReg{src+i}, v1), ds_pattern);
469 }
470 }
471
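/* Lowers p_reduce, p_inclusive_scan and p_exclusive_scan. The general shape is:
 *  - save EXEC, enable all lanes and fill inactive lanes of tmp with the
 *    identity value,
 *  - combine lanes with a ladder of DPP shuffles (ds_swizzle on GFX6/7,
 *    v_permlanex16/readlane where GFX10 lacks row_bcast),
 *  - restore EXEC and move/readlane the result into dst.
 *
 * As a rough sketch (not the exact emitted sequence; register names are
 * placeholders), an iadd16 p_reduce with cluster_size == 4 on GFX8+ boils
 * down to two DPP adds:
 *    v_add_u16_dpp v_tmp, v_tmp, v_tmp quad_perm:[1,0,3,2]
 *    v_add_u16_dpp v_tmp, v_tmp, v_tmp quad_perm:[2,3,0,1]
 */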
472 void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, PhysReg tmp,
473 PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
474 {
475 assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
476 assert(cluster_size <= ctx->program->wave_size);
477
478 Builder bld(ctx->program, &ctx->instructions);
479
480 Operand identity[2];
481 identity[0] = Operand(get_reduction_identity(reduce_op, 0));
482 identity[1] = Operand(get_reduction_identity(reduce_op, 1));
483 Operand vcndmask_identity[2] = {identity[0], identity[1]};
484
485 /* First, copy the source to tmp and set inactive lanes to the identity */
486 bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1), Definition(exec, bld.lm), Operand(UINT64_MAX), Operand(exec, bld.lm));
487
488 for (unsigned i = 0; i < src.size(); i++) {
489       /* p_exclusive_scan needs the identity to be an sgpr or inline constant for the v_writelane_b32,
490        * except on GFX10, where v_writelane_b32 can take a literal. */
491 if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && ctx->program->chip_class < GFX10) {
492 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]);
493 identity[i] = Operand(PhysReg{sitmp+i}, s1);
494
495 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]);
496 vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1);
497 } else if (identity[i].isLiteral()) {
498 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]);
499 vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1);
500 }
501 }
502
503 for (unsigned i = 0; i < src.size(); i++) {
504 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
505 vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
506 Operand(stmp, bld.lm));
507 }
508
509 bool reduction_needs_last_op = false;
510 switch (op) {
511 case aco_opcode::p_reduce:
512 if (cluster_size == 1) break;
513
514 if (ctx->program->chip_class <= GFX7) {
515 reduction_needs_last_op = true;
516 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
517 if (cluster_size == 2) break;
518 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
519 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
520 if (cluster_size == 4) break;
521 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
522 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
523 if (cluster_size == 8) break;
524 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
525 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
526 if (cluster_size == 16) break;
527 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
528 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
529 if (cluster_size == 32) break;
530 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
531 for (unsigned i = 0; i < src.size(); i++)
532 bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1), Operand(0u));
533                // TODO: it would be more efficient to do the last reduction step on SALU
534 emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
535 reduction_needs_last_op = false;
536 break;
537 }
538
539 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf, 0xf, false);
540 if (cluster_size == 2) break;
541 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf, 0xf, false);
542 if (cluster_size == 4) break;
543 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf, false);
544 if (cluster_size == 8) break;
545 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
546 if (cluster_size == 16) break;
547
548 if (ctx->program->chip_class >= GFX10) {
549 /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
550 for (unsigned i = 0; i < src.size(); i++)
551 bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));
552
553 if (cluster_size == 32) {
554 reduction_needs_last_op = true;
555 break;
556 }
557
558 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
559 for (unsigned i = 0; i < src.size(); i++)
560 bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u));
561            // TODO: it would be more efficient to do the last reduction step on SALU
562 emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
563 break;
564 }
565
566 if (cluster_size == 32) {
567 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
568 reduction_needs_last_op = true;
569 break;
570 }
571 assert(cluster_size == 64);
572 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf, false);
573 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf, false);
574 break;
575 case aco_opcode::p_exclusive_scan:
576 if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
577 /* shift rows right */
578 emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);
579
580 /* fill in the gaps in rows 1 and 3 */
581 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10000u));
582 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0x10000u));
583 for (unsigned i = 0; i < src.size(); i++) {
584 Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
585 Definition(PhysReg{vtmp+i}, v1),
586 Operand(PhysReg{tmp+i}, v1),
587 Operand(0xffffffffu), Operand(0xffffffffu)).instr;
588 static_cast<VOP3A_instruction*>(perm)->opsel = 1; /* FI (Fetch Inactive) */
589 }
590 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(UINT64_MAX));
591
592 if (ctx->program->wave_size == 64) {
593 /* fill in the gap in row 2 */
594 for (unsigned i = 0; i < src.size(); i++) {
595 bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
596 bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1));
597 }
598 }
599 std::swap(tmp, vtmp);
600 } else if (ctx->program->chip_class >= GFX8) {
601 emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
602 } else {
603 // TODO: use LDS on CS with a single write and shifted read
604 /* wavefront shift_right by 1 on SI/CI */
605 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
606 emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
607 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10101010u));
608 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
609 for (unsigned i = 0; i < src.size(); i++)
610 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1));
611
612 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
613 emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
614 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x01000100u));
615 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
616 for (unsigned i = 0; i < src.size(); i++)
617 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1));
618
619 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
620 emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
621 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(1u), Operand(16u));
622 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(1u), Operand(16u));
623 for (unsigned i = 0; i < src.size(); i++)
624 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1));
625
626 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
627 for (unsigned i = 0; i < src.size(); i++) {
628 bld.writelane(Definition(PhysReg{vtmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{vtmp+i}, v1));
629 bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u));
630 bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1));
631 identity[i] = Operand(0u); /* prevent further uses of identity */
632 }
633 std::swap(tmp, vtmp);
634 }
635
636 for (unsigned i = 0; i < src.size(); i++) {
637             if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
638 if (ctx->program->chip_class < GFX10)
639 assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
640 bld.writelane(Definition(PhysReg{tmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{tmp+i}, v1));
641 }
642 }
643 /* fall through */
644 case aco_opcode::p_inclusive_scan:
645 assert(cluster_size == ctx->program->wave_size);
646 if (ctx->program->chip_class <= GFX7) {
647 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
648 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xAAAAAAAAu));
649 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
650 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
651
652 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
653 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
654 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xCCCCCCCCu));
655 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
656 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
657
658 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
659 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
660 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xF0F0F0F0u));
661 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
662 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
663
664 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
665 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
666 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xFF00FF00u));
667 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
668 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
669
670 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
671 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
672 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(16u), Operand(16u));
673 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(16u), Operand(16u));
674 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
675
676 for (unsigned i = 0; i < src.size(); i++)
677 bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
678 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u));
679 emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
680 break;
681 }
682
683 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
684 dpp_row_sr(1), 0xf, 0xf, false, identity);
685 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
686 dpp_row_sr(2), 0xf, 0xf, false, identity);
687 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
688 dpp_row_sr(4), 0xf, 0xf, false, identity);
689 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
690 dpp_row_sr(8), 0xf, 0xf, false, identity);
691 if (ctx->program->chip_class >= GFX10) {
692 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(16u), Operand(16u));
693 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(16u), Operand(16u));
694 for (unsigned i = 0; i < src.size(); i++) {
695 Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
696 Definition(PhysReg{vtmp+i}, v1),
697 Operand(PhysReg{tmp+i}, v1),
698 Operand(0xffffffffu), Operand(0xffffffffu)).instr;
699 static_cast<VOP3A_instruction*>(perm)->opsel = 1; /* FI (Fetch Inactive) */
700 }
701 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
702
703 if (ctx->program->wave_size == 64) {
704 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u));
705 for (unsigned i = 0; i < src.size(); i++)
706 bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
707 emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
708 }
709 } else {
710 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
711 dpp_row_bcast15, 0xa, 0xf, false, identity);
712 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
713 dpp_row_bcast31, 0xc, 0xf, false, identity);
714 }
715 break;
716 default:
717 unreachable("Invalid reduction mode");
718 }
719
720
721 if (op == aco_opcode::p_reduce) {
722 if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
723 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
724 emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
725 return;
726 }
727
728 if (reduction_needs_last_op)
729 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
730 }
731
732 /* restore exec */
733 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
734
735 if (dst.regClass().type() == RegType::sgpr) {
736 for (unsigned k = 0; k < src.size(); k++) {
737 bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1),
738 Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
739 }
740 } else if (dst.physReg() != tmp) {
741 for (unsigned k = 0; k < src.size(); k++) {
742 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
743 Operand(PhysReg{tmp + k}, v1));
744 }
745 }
746 }
747
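/* One pending copy of the parallel-copy lowering below. uses[i] counts how
 * many other pending copies still read byte i of this copy's destination
 * register; is_used aliases the whole array so it can be tested against zero
 * in one go. */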
748 struct copy_operation {
749 Operand op;
750 Definition def;
751 unsigned bytes;
752 union {
753 uint8_t uses[8];
754 uint64_t is_used = 0;
755 };
756 };
757
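/* Picks the largest power-of-two sized piece of 'src' starting at 'offset'
 * that is properly aligned, fits in max_size and (unless ignore_uses is set)
 * does not mix used and unused destination bytes, and returns it as a
 * Definition/Operand pair. */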
758 void split_copy(unsigned offset, Definition *def, Operand *op, const copy_operation& src, bool ignore_uses, unsigned max_size)
759 {
760 PhysReg def_reg = src.def.physReg();
761 PhysReg op_reg = src.op.physReg();
762 def_reg.reg_b += offset;
763 op_reg.reg_b += offset;
764
765 max_size = MIN2(max_size, src.def.regClass().type() == RegType::vgpr ? 4 : 8);
766
767 /* make sure the size is a power of two and reg % bytes == 0 */
768 unsigned bytes = 1;
769 for (; bytes <= max_size; bytes *= 2) {
770 unsigned next = bytes * 2u;
771 bool can_increase = def_reg.reg_b % next == 0 &&
772 offset + next <= src.bytes && next <= max_size;
773 if (!src.op.isConstant() && can_increase)
774 can_increase = op_reg.reg_b % next == 0;
775 for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
776 can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
777 if (!can_increase)
778 break;
779 }
780
781 RegClass def_cls = bytes % 4 == 0 ? RegClass(src.def.regClass().type(), bytes / 4u) :
782 RegClass(src.def.regClass().type(), bytes).as_subdword();
783 *def = Definition(src.def.tempId(), def_reg, def_cls);
784 if (src.op.isConstant()) {
785 assert(offset == 0 || (offset == 4 && src.op.bytes() == 8));
786 if (src.op.bytes() == 8 && bytes == 4)
787 *op = Operand(uint32_t(src.op.constantValue64() >> (offset * 8u)));
788 else
789 *op = src.op;
790 } else {
791 RegClass op_cls = bytes % 4 == 0 ? RegClass(src.op.regClass().type(), bytes / 4u) :
792 RegClass(src.op.regClass().type(), bytes).as_subdword();
793 *op = Operand(op_reg, op_cls);
794 op->setTemp(Temp(src.op.tempId(), op_cls));
795 }
796 }
797
798 uint32_t get_intersection_mask(int a_start, int a_size,
799 int b_start, int b_size)
800 {
801 int intersection_start = MAX2(b_start - a_start, 0);
802 int intersection_end = MAX2(b_start + b_size - a_start, 0);
803 if (intersection_start >= a_size || intersection_end == 0)
804 return 0;
805
806 uint32_t mask = u_bit_consecutive(0, a_size);
807 return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
808 }
809
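/* Emits the parts of 'copy' whose destination bytes are not read by any other
 * pending copy. Returns true if anything was written and sets *preserve_scc
 * when SCC is written via s_cmp_lg_i32, so the cycle-breaking swaps afterwards
 * avoid clobbering it. */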
810 bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc)
811 {
812 bool did_copy = false;
813 for (unsigned offset = 0; offset < copy.bytes;) {
814 if (copy.uses[offset]) {
815 offset++;
816 continue;
817 }
818
819 Definition def;
820 Operand op;
821 split_copy(offset, &def, &op, copy, false, 8);
822
823 if (def.physReg() == scc) {
824 bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand(0u));
825 *preserve_scc = true;
826 } else if (def.bytes() == 8 && def.getTemp().type() == RegType::sgpr) {
827 bld.sop1(aco_opcode::s_mov_b64, def, Operand(op.physReg(), s2));
828 } else {
829 bld.copy(def, op);
830 }
831
832 ctx->program->statistics[statistic_copies]++;
833
834 did_copy = true;
835 offset += def.bytes();
836 }
837 return did_copy;
838 }
839
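/* Swaps the contents of copy.op and copy.def: v_swap_b32 on GFX9+, otherwise
 * xor-swap triples (VALU, SALU or SDWA depending on the register class), with
 * special cases for SCC and for sub-dword swaps within a single VGPR. */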
840 void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool preserve_scc, Pseudo_instruction *pi)
841 {
842 unsigned offset = 0;
843
844 if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
845 (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
846 /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte swap */
847 PhysReg op = copy.op.physReg();
848 PhysReg def = copy.def.physReg();
849 op.reg_b &= ~0x3;
850 def.reg_b &= ~0x3;
851
852 copy_operation tmp;
853 tmp.op = Operand(op, v1);
854 tmp.def = Definition(def, v1);
855 tmp.bytes = 4;
856 memset(tmp.uses, 1, 4);
857 do_swap(ctx, bld, tmp, preserve_scc, pi);
858
859 op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
860 def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
861 tmp.op = Operand(op, v1b);
862 tmp.def = Definition(def, v1b);
863 tmp.bytes = 1;
864 tmp.uses[0] = 1;
865 do_swap(ctx, bld, tmp, preserve_scc, pi);
866
867 offset = copy.bytes;
868 }
869
870 for (; offset < copy.bytes;) {
871 Definition def;
872 Operand op;
873 split_copy(offset, &def, &op, copy, true, 8);
874
875 assert(op.regClass() == def.regClass());
876 Operand def_as_op = Operand(def.physReg(), def.regClass());
877 Definition op_as_def = Definition(op.physReg(), op.regClass());
878 if (ctx->program->chip_class >= GFX9 && def.regClass() == v1) {
879 bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
880 ctx->program->statistics[statistic_copies]++;
881 } else if (def.regClass() == v1) {
882 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
883 bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
884 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
885 ctx->program->statistics[statistic_copies] += 3;
886 } else if (op.physReg() == scc || def.physReg() == scc) {
887 /* we need to swap scc and another sgpr */
888 assert(!preserve_scc);
889
890 PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();
891
892 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
893 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1), Operand(0u));
894 bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
895 ctx->program->statistics[statistic_copies] += 3;
896 } else if (def.regClass() == s1) {
897 if (preserve_scc) {
898 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
899 bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
900 bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
901 } else {
902 bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
903 bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
904 bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
905 }
906 ctx->program->statistics[statistic_copies] += 3;
907 } else if (def.regClass() == s2) {
908 if (preserve_scc)
909 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
910 bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
911 bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
912 bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
913 if (preserve_scc)
914 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1), Operand(0u));
915 ctx->program->statistics[statistic_copies] += 3;
916 } else if (ctx->program->chip_class >= GFX9 && def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
917 aco_ptr<VOP3P_instruction> vop3p{create_instruction<VOP3P_instruction>(aco_opcode::v_pk_add_u16, Format::VOP3P, 2, 1)};
918 vop3p->operands[0] = Operand(PhysReg{op.physReg().reg()}, v1);
919 vop3p->operands[1] = Operand(0u);
920 vop3p->definitions[0] = Definition(PhysReg{op.physReg().reg()}, v1);
921 vop3p->opsel_lo = 0x1;
922 vop3p->opsel_hi = 0x2;
923 bld.insert(std::move(vop3p));
924 } else {
925 assert(def.regClass().is_subdword());
926 bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
927 bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
928 bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
929 ctx->program->statistics[statistic_copies] += 3;
930 }
931
932 offset += def.bytes();
933 }
934
935 /* fixup in case we swapped bytes we shouldn't have */
936 copy_operation tmp_copy = copy;
937 tmp_copy.op.setFixed(copy.def.physReg());
938 tmp_copy.def.setFixed(copy.op.physReg());
939 do_copy(ctx, bld, tmp_copy, &preserve_scc);
940 }
941
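/* Lowers a set of parallel copies (destination reg -> copy_operation) to
 * hardware moves. First the use count of every destination byte is gathered,
 * then copies whose destinations are no longer needed as a source are emitted
 * (splitting partially blocked copies), and whatever remains forms cycles
 * that are resolved with swaps. */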
942 void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi)
943 {
944 Builder bld(ctx->program, &ctx->instructions);
945 aco_ptr<Instruction> mov;
946 std::map<PhysReg, copy_operation>::iterator it = copy_map.begin();
947 std::map<PhysReg, copy_operation>::iterator target;
948 bool writes_scc = false;
949
950 /* count the number of uses for each dst reg */
951 while (it != copy_map.end()) {
952
953 if (it->second.def.physReg() == scc)
954 writes_scc = true;
955
956 assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));
957
958 /* if src and dst reg are the same, remove operation */
959 if (it->first == it->second.op.physReg()) {
960 it = copy_map.erase(it);
961 continue;
962 }
963
964 /* split large copies */
965 if (it->second.bytes > 8) {
966 assert(!it->second.op.isConstant());
967 assert(!it->second.def.regClass().is_subdword());
968 RegClass rc = RegClass(it->second.def.regClass().type(), it->second.def.size() - 2);
969 Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
970 rc = RegClass(it->second.op.regClass().type(), it->second.op.size() - 2);
971 Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
972 copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
973 copy_map[hi_def.physReg()] = copy;
974 assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
975 it->second.op = Operand(it->second.op.physReg(), it->second.op.regClass().type() == RegType::sgpr ? s2 : v2);
976 it->second.def = Definition(it->second.def.physReg(), it->second.def.regClass().type() == RegType::sgpr ? s2 : v2);
977 it->second.bytes = 8;
978 }
979
980 /* check if the definition reg is used by another copy operation */
981 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
982 if (copy.second.op.isConstant())
983 continue;
984 for (uint16_t i = 0; i < it->second.bytes; i++) {
985 /* distance might underflow */
986 unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
987 if (distance < copy.second.bytes)
988 it->second.uses[i] += 1;
989 }
990 }
991
992 ++it;
993 }
994
995 /* first, handle paths in the location transfer graph */
996 bool preserve_scc = pi->tmp_in_scc && !writes_scc;
997 it = copy_map.begin();
998 while (it != copy_map.end()) {
999
1000 /* try to coalesce 32-bit sgpr copies to 64-bit copies */
1001 if (it->second.is_used == 0 &&
1002 it->second.def.getTemp().type() == RegType::sgpr && it->second.bytes == 4 &&
1003 !it->second.op.isConstant() && it->first % 2 == it->second.op.physReg() % 2) {
1004
1005 PhysReg other_def_reg = PhysReg{it->first % 2 ? it->first - 1 : it->first + 1};
1006 PhysReg other_op_reg = PhysReg{it->first % 2 ? it->second.op.physReg() - 1 : it->second.op.physReg() + 1};
1007 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(other_def_reg);
1008
1009 if (other != copy_map.end() && !other->second.is_used && other->second.bytes == 4 &&
1010 other->second.op.physReg() == other_op_reg && !other->second.op.isConstant()) {
1011 std::map<PhysReg, copy_operation>::iterator to_erase = it->first % 2 ? it : other;
1012 it = it->first % 2 ? other : it;
1013 copy_map.erase(to_erase);
1014 it->second.bytes = 8;
1015 }
1016 }
1017 // TODO: try to coalesce subdword copies
1018
1019 /* find portions where the target reg is not used as operand for any other copy */
1020 if (it->second.is_used) {
1021 if (it->second.op.isConstant()) {
1022 /* we have to skip constants until is_used=0 */
1023 ++it;
1024 continue;
1025 }
1026
1027 unsigned has_zero_use_bytes = 0;
1028 for (unsigned i = 0; i < it->second.bytes; i++)
1029 has_zero_use_bytes |= (it->second.uses[i] == 0) << i;
1030
1031 if (has_zero_use_bytes) {
1032                   /* Skipping the partial copy and doing a v_swap_b32 plus fixup copies
1033                    * is usually beneficial for sub-dword copies, but if a partial copy
1034                    * enables further copies, it should be done instead. */
1035 bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
1036 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1037 if (partial_copy)
1038 break;
1039 for (uint16_t i = 0; i < copy.second.bytes; i++) {
1040 /* distance might underflow */
1041 unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
1042 if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
1043 !it->second.uses[distance])
1044 partial_copy = true;
1045 }
1046 }
1047
1048 if (!partial_copy) {
1049 ++it;
1050 continue;
1051 }
1052 } else {
1053 /* full target reg is used: register swapping needed */
1054 ++it;
1055 continue;
1056 }
1057 }
1058
1059 bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc);
1060
1061 std::pair<PhysReg, copy_operation> copy = *it;
1062
1063 if (it->second.is_used == 0) {
1064 /* the target reg is not used as operand for any other copy, so we
1065 * copied to all of it */
1066 copy_map.erase(it);
1067 it = copy_map.begin();
1068 } else {
1069 /* we only performed some portions of this copy, so split it to only
1070 * leave the portions that still need to be done */
1071 copy_operation original = it->second; /* the map insertion below can overwrite this */
1072 copy_map.erase(it);
1073 for (unsigned offset = 0; offset < original.bytes;) {
1074 if (original.uses[offset] == 0) {
1075 offset++;
1076 continue;
1077 }
1078 Definition def;
1079 Operand op;
1080 split_copy(offset, &def, &op, original, false, 8);
1081
1082 copy_operation copy = {op, def, def.bytes()};
1083 for (unsigned i = 0; i < copy.bytes; i++)
1084 copy.uses[i] = original.uses[i + offset];
1085 copy_map[def.physReg()] = copy;
1086
1087 offset += def.bytes();
1088 }
1089
1090 it = copy_map.begin();
1091 }
1092
1093 /* Reduce the number of uses of the operand reg by one. Do this after
1094        * splitting the copy or removing it in case the copy writes to its own
1095 * operand (for example, v[7:8] = v[8:9]) */
1096 if (did_copy && !copy.second.op.isConstant()) {
1097 for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
1098 for (uint16_t i = 0; i < other.second.bytes; i++) {
1099 /* distance might underflow */
1100 unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
1101 if (distance < copy.second.bytes && !copy.second.uses[distance])
1102 other.second.uses[i] -= 1;
1103 }
1104 }
1105 }
1106 }
1107
1108 if (copy_map.empty())
1109 return;
1110
1111    /* all target regs are needed as operands somewhere, which means all entries are part of a cycle */
1112 unsigned largest = 0;
1113 for (const std::pair<PhysReg, copy_operation>& op : copy_map)
1114 largest = MAX2(largest, op.second.bytes);
1115
1116 while (!copy_map.empty()) {
1117
1118       /* Perform larger swaps first, because larger swaps can make other
1119 * swaps unnecessary. */
1120 auto it = copy_map.begin();
1121 for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
1122 if (it2->second.bytes > it->second.bytes) {
1123 it = it2;
1124 if (it->second.bytes == largest)
1125 break;
1126 }
1127 }
1128
1129 /* should already be done */
1130 assert(!it->second.op.isConstant());
1131
1132 assert(it->second.op.isFixed());
1133 assert(it->second.def.regClass() == it->second.op.regClass());
1134
1135 if (it->first == it->second.op.physReg()) {
1136 copy_map.erase(it);
1137 continue;
1138 }
1139
1140 if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
1141 assert(!(it->second.def.physReg() == pi->scratch_sgpr));
1142
1143 /* to resolve the cycle, we have to swap the src reg with the dst reg */
1144 copy_operation swap = it->second;
1145
1146 /* if this is self-intersecting, we have to split it because
1147 * self-intersecting swaps don't make sense */
1148 PhysReg lower = swap.def.physReg();
1149 PhysReg higher = swap.op.physReg();
1150 if (lower.reg_b > higher.reg_b)
1151 std::swap(lower, higher);
1152 if (higher.reg_b - lower.reg_b < (int)swap.bytes) {
1153 unsigned offset = higher.reg_b - lower.reg_b;
1154 RegType type = swap.def.regClass().type();
1155
1156 copy_operation middle;
1157 lower.reg_b += offset;
1158 higher.reg_b += offset;
1159 middle.bytes = swap.bytes - offset * 2;
1160 memcpy(middle.uses, swap.uses + offset, middle.bytes);
1161 middle.op = Operand(lower, RegClass::get(type, middle.bytes));
1162 middle.def = Definition(higher, RegClass::get(type, middle.bytes));
1163 copy_map[higher] = middle;
1164
1165 copy_operation end;
1166 lower.reg_b += middle.bytes;
1167 higher.reg_b += middle.bytes;
1168 end.bytes = swap.bytes - (offset + middle.bytes);
1169 memcpy(end.uses, swap.uses + offset + middle.bytes, end.bytes);
1170 end.op = Operand(lower, RegClass::get(type, end.bytes));
1171 end.def = Definition(higher, RegClass::get(type, end.bytes));
1172 copy_map[higher] = end;
1173
1174 memset(swap.uses + offset, 0, swap.bytes - offset);
1175 swap.bytes = offset;
1176 }
1177
1178 do_swap(ctx, bld, swap, preserve_scc, pi);
1179
1180 /* remove from map */
1181 copy_map.erase(it);
1182
1183 /* change the operand reg of the target's uses and split uses if needed */
1184 target = copy_map.begin();
1185 uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
1186 for (; target != copy_map.end(); ++target) {
1187 if (target->second.op.physReg() == swap.def.physReg() && swap.bytes == target->second.bytes) {
1188 target->second.op.setFixed(swap.op.physReg());
1189 break;
1190 }
1191
1192 uint32_t imask = get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
1193 target->second.op.physReg().reg_b, target->second.bytes);
1194
1195 if (!imask)
1196 continue;
1197
1198 assert(target->second.bytes < swap.bytes);
1199
1200 int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;
1201
1202 /* split and update the middle (the portion that reads the swap's
1203 * definition) to read the swap's operand instead */
1204 int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
1205 int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
1206 int before_bytes = MAX2(-offset, 0);
1207 int after_bytes = MAX2(target_op_end - swap_def_end, 0);
1208 int middle_bytes = target->second.bytes - before_bytes - after_bytes;
1209
1210 if (after_bytes) {
1211 unsigned after_offset = before_bytes + middle_bytes;
1212 assert(after_offset > 0);
1213 copy_operation copy;
1214 copy.bytes = after_bytes;
1215 memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
1216 RegClass rc = RegClass::get(target->second.op.regClass().type(), after_bytes);
1217 copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
1218 copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
1219 copy_map[copy.def.physReg()] = copy;
1220 }
1221
1222 if (middle_bytes) {
1223 copy_operation copy;
1224 copy.bytes = middle_bytes;
1225 memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
1226 RegClass rc = RegClass::get(target->second.op.regClass().type(), middle_bytes);
1227 copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
1228 copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
1229 copy_map[copy.def.physReg()] = copy;
1230 }
1231
1232 if (before_bytes) {
1233 copy_operation copy;
1234 target->second.bytes = before_bytes;
1235 RegClass rc = RegClass::get(target->second.op.regClass().type(), before_bytes);
1236 target->second.op = Operand(target->second.op.physReg(), rc);
1237 target->second.def = Definition(target->second.def.physReg(), rc);
1238 memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
1239 }
1240
1241 /* break early since we know each byte of the swap's definition is used
1242 * at most once */
1243 bytes_left &= ~imask;
1244 if (!bytes_left)
1245 break;
1246 }
1247 }
1248 }
1249
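/* Final lowering pass: replaces the remaining pseudo instructions (vector
 * splits/concats, parallel copies, spills/reloads, reductions, branches, ...)
 * with real hardware instructions and rebuilds each block's instruction list. */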
1250 void lower_to_hw_instr(Program* program)
1251 {
1252 Block *discard_block = NULL;
1253
1254 for (size_t i = 0; i < program->blocks.size(); i++)
1255 {
1256 Block *block = &program->blocks[i];
1257 lower_context ctx;
1258 ctx.program = program;
1259 Builder bld(program, &ctx.instructions);
1260
1261 bool set_mode = i == 0 && block->fp_mode.val != program->config->float_mode;
1262 for (unsigned pred : block->linear_preds) {
1263 if (program->blocks[pred].fp_mode.val != block->fp_mode.val) {
1264 set_mode = true;
1265 break;
1266 }
1267 }
1268 if (set_mode) {
1269 /* only allow changing modes at top-level blocks so this doesn't break
1270 * the "jump over empty blocks" optimization */
1271 assert(block->kind & block_kind_top_level);
1272 uint32_t mode = block->fp_mode.val;
1273 /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
1274 bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand(mode), (7 << 11) | 1);
1275 }
1276
1277 for (size_t j = 0; j < block->instructions.size(); j++) {
1278 aco_ptr<Instruction>& instr = block->instructions[j];
1279 aco_ptr<Instruction> mov;
1280 if (instr->format == Format::PSEUDO) {
1281 Pseudo_instruction *pi = (Pseudo_instruction*)instr.get();
1282
1283 switch (instr->opcode)
1284 {
1285 case aco_opcode::p_extract_vector:
1286 {
1287 PhysReg reg = instr->operands[0].physReg();
1288 Definition& def = instr->definitions[0];
1289 reg.reg_b += instr->operands[1].constantValue() * def.bytes();
1290
1291 if (reg == def.physReg())
1292 break;
1293
1294 RegClass op_rc = def.regClass().is_subdword() ? def.regClass() :
1295 RegClass(instr->operands[0].getTemp().type(), def.size());
1296 std::map<PhysReg, copy_operation> copy_operations;
1297 copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
1298 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1299 break;
1300 }
1301 case aco_opcode::p_create_vector:
1302 {
1303 std::map<PhysReg, copy_operation> copy_operations;
1304 PhysReg reg = instr->definitions[0].physReg();
1305
1306 for (const Operand& op : instr->operands) {
1307 if (op.isConstant()) {
1308 const Definition def = Definition(reg, RegClass(instr->definitions[0].getTemp().type(), op.size()));
1309 copy_operations[reg] = {op, def, op.bytes()};
1310 reg.reg_b += op.bytes();
1311 continue;
1312 }
1313 if (op.isUndefined()) {
1314 // TODO: coalesce subdword copies if dst byte is 0
1315 reg.reg_b += op.bytes();
1316 continue;
1317 }
1318
1319 RegClass rc_def = op.regClass().is_subdword() ? op.regClass() :
1320 RegClass(instr->definitions[0].getTemp().type(), op.size());
1321 const Definition def = Definition(reg, rc_def);
1322 copy_operations[def.physReg()] = {op, def, op.bytes()};
1323 reg.reg_b += op.bytes();
1324 }
1325 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1326 break;
1327 }
1328 case aco_opcode::p_split_vector:
1329 {
1330 std::map<PhysReg, copy_operation> copy_operations;
1331 PhysReg reg = instr->operands[0].physReg();
1332
1333 for (const Definition& def : instr->definitions) {
1334 RegClass rc_op = def.regClass().is_subdword() ? def.regClass() :
1335 RegClass(instr->operands[0].getTemp().type(), def.size());
1336 const Operand op = Operand(reg, rc_op);
1337 copy_operations[def.physReg()] = {op, def, def.bytes()};
1338 reg.reg_b += def.bytes();
1339 }
1340 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1341 break;
1342 }
1343 case aco_opcode::p_parallelcopy:
1344 case aco_opcode::p_wqm:
1345 {
1346 std::map<PhysReg, copy_operation> copy_operations;
1347 for (unsigned i = 0; i < instr->operands.size(); i++) {
1348 assert(instr->definitions[i].bytes() == instr->operands[i].bytes());
1349 copy_operations[instr->definitions[i].physReg()] = {instr->operands[i], instr->definitions[i], instr->operands[i].bytes()};
1350 }
1351 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1352 break;
1353 }
1354 case aco_opcode::p_exit_early_if:
1355 {
1356 /* don't bother with an early exit near the end of the program */
1357 if ((block->instructions.size() - 1 - j) <= 4 &&
1358 block->instructions.back()->opcode == aco_opcode::s_endpgm) {
1359 unsigned null_exp_dest = (ctx.program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
1360 bool ignore_early_exit = true;
1361
1362 for (unsigned k = j + 1; k < block->instructions.size(); ++k) {
1363 const aco_ptr<Instruction> &instr = block->instructions[k];
1364 if (instr->opcode == aco_opcode::s_endpgm ||
1365 instr->opcode == aco_opcode::p_logical_end)
1366 continue;
1367 else if (instr->opcode == aco_opcode::exp &&
1368 static_cast<Export_instruction *>(instr.get())->dest == null_exp_dest)
1369 continue;
1370 else if (instr->opcode == aco_opcode::p_parallelcopy &&
1371 instr->definitions[0].isFixed() &&
1372 instr->definitions[0].physReg() == exec)
1373 continue;
1374
1375 ignore_early_exit = false;
1376 }
1377
1378 if (ignore_early_exit)
1379 break;
1380 }
1381
1382 if (!discard_block) {
1383 discard_block = program->create_and_insert_block();
1384 block = &program->blocks[i];
1385
1386 bld.reset(discard_block);
1387 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
1388 0, V_008DFC_SQ_EXP_NULL, false, true, true);
1389 if (program->wb_smem_l1_on_end)
1390 bld.smem(aco_opcode::s_dcache_wb);
1391 bld.sopp(aco_opcode::s_endpgm);
1392
1393 bld.reset(&ctx.instructions);
1394 }
1395
1396            // TODO: exec can be zero here with block_kind_discard
1397
1398 assert(instr->operands[0].physReg() == scc);
1399 bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index);
1400
1401 discard_block->linear_preds.push_back(block->index);
1402 block->linear_succs.push_back(discard_block->index);
1403 break;
1404 }
1405 case aco_opcode::p_spill:
1406 {
1407 assert(instr->operands[0].regClass() == v1.as_linear());
1408 for (unsigned i = 0; i < instr->operands[2].size(); i++)
1409 bld.writelane(bld.def(v1, instr->operands[0].physReg()),
1410 Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
1411 Operand(instr->operands[1].constantValue() + i),
1412 instr->operands[0]);
1413 break;
1414 }
1415 case aco_opcode::p_reload:
1416 {
1417 assert(instr->operands[0].regClass() == v1.as_linear());
1418 for (unsigned i = 0; i < instr->definitions[0].size(); i++)
1419 bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
1420 instr->operands[0],
1421 Operand(instr->operands[1].constantValue() + i));
1422 break;
1423 }
1424 case aco_opcode::p_as_uniform:
1425 {
1426 if (instr->operands[0].isConstant() || instr->operands[0].regClass().type() == RegType::sgpr) {
1427 std::map<PhysReg, copy_operation> copy_operations;
1428 copy_operations[instr->definitions[0].physReg()] = {instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
1429 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1430 } else {
1431 assert(instr->operands[0].regClass().type() == RegType::vgpr);
1432 assert(instr->definitions[0].regClass().type() == RegType::sgpr);
1433 assert(instr->operands[0].size() == instr->definitions[0].size());
1434 for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
1435 bld.vop1(aco_opcode::v_readfirstlane_b32,
1436 bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
1437 Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
1438 }
1439 }
1440 break;
1441 }
1442 default:
1443 break;
1444 }
1445 } else if (instr->format == Format::PSEUDO_BRANCH) {
1446 Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(instr.get());
1447 /* check if all blocks from current to target are empty */
1448 bool can_remove = block->index < branch->target[0];
1449 for (unsigned i = block->index + 1; can_remove && i < branch->target[0]; i++) {
1450 if (program->blocks[i].instructions.size())
1451 can_remove = false;
1452 }
1453 if (can_remove)
1454 continue;
1455
1456 switch (instr->opcode) {
1457 case aco_opcode::p_branch:
1458 assert(block->linear_succs[0] == branch->target[0]);
1459 bld.sopp(aco_opcode::s_branch, branch->target[0]);
1460 break;
1461 case aco_opcode::p_cbranch_nz:
1462 assert(block->linear_succs[1] == branch->target[0]);
1463 if (branch->operands[0].physReg() == exec)
1464 bld.sopp(aco_opcode::s_cbranch_execnz, branch->target[0]);
1465 else if (branch->operands[0].physReg() == vcc)
1466 bld.sopp(aco_opcode::s_cbranch_vccnz, branch->target[0]);
1467 else {
1468 assert(branch->operands[0].physReg() == scc);
1469 bld.sopp(aco_opcode::s_cbranch_scc1, branch->target[0]);
1470 }
1471 break;
1472 case aco_opcode::p_cbranch_z:
1473 assert(block->linear_succs[1] == branch->target[0]);
1474 if (branch->operands[0].physReg() == exec)
1475 bld.sopp(aco_opcode::s_cbranch_execz, branch->target[0]);
1476 else if (branch->operands[0].physReg() == vcc)
1477 bld.sopp(aco_opcode::s_cbranch_vccz, branch->target[0]);
1478 else {
1479 assert(branch->operands[0].physReg() == scc);
1480 bld.sopp(aco_opcode::s_cbranch_scc0, branch->target[0]);
1481 }
1482 break;
1483 default:
1484 unreachable("Unknown Pseudo branch instruction!");
1485 }
1486
1487 } else if (instr->format == Format::PSEUDO_REDUCTION) {
1488 Pseudo_reduction_instruction* reduce = static_cast<Pseudo_reduction_instruction*>(instr.get());
1489 if (reduce->reduce_op == gfx10_wave64_bpermute) {
1490 /* Only makes sense on GFX10 wave64 */
1491 assert(program->chip_class >= GFX10);
1492 assert(program->info->wave_size == 64);
1493 assert(instr->definitions[0].regClass() == v1); /* Destination */
1494 assert(instr->definitions[1].regClass() == s2); /* Temp EXEC */
1495 assert(instr->definitions[1].physReg() != vcc);
1496 assert(instr->definitions[2].physReg() == scc); /* SCC clobber */
1497 assert(instr->operands[0].physReg() == vcc); /* Compare */
1498 assert(instr->operands[1].regClass() == v2.as_linear()); /* Temp VGPR pair */
1499 assert(instr->operands[2].regClass() == v1); /* Indices x4 */
1500 assert(instr->operands[3].regClass() == v1); /* Input data */
1501
1502 PhysReg shared_vgpr_reg_lo = PhysReg(align(program->config->num_vgprs, 4) + 256);
1503 PhysReg shared_vgpr_reg_hi = PhysReg(shared_vgpr_reg_lo + 1);
1504 Operand compare = instr->operands[0];
1505 Operand tmp1(instr->operands[1].physReg(), v1);
1506 Operand tmp2(PhysReg(instr->operands[1].physReg() + 1), v1);
1507 Operand index_x4 = instr->operands[2];
1508 Operand input_data = instr->operands[3];
1509 Definition shared_vgpr_lo(shared_vgpr_reg_lo, v1);
1510 Definition shared_vgpr_hi(shared_vgpr_reg_hi, v1);
1511 Definition def_temp1(tmp1.physReg(), v1);
1512 Definition def_temp2(tmp2.physReg(), v1);
1513
1514 /* Save EXEC and set it for all lanes */
1515 bld.sop1(aco_opcode::s_or_saveexec_b64, instr->definitions[1], instr->definitions[2],
1516 Definition(exec, s2), Operand((uint64_t)-1), Operand(exec, s2));
1517
1518 /* HI: Copy data from high lanes 32-63 to shared vgpr */
1519 bld.vop1_dpp(aco_opcode::v_mov_b32, shared_vgpr_hi, input_data, dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
1520
1521 /* LO: Copy data from low lanes 0-31 to shared vgpr */
1522 bld.vop1_dpp(aco_opcode::v_mov_b32, shared_vgpr_lo, input_data, dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false);
1523 /* LO: Copy shared vgpr (high lanes' data) to output vgpr */
1524 bld.vop1_dpp(aco_opcode::v_mov_b32, def_temp1, Operand(shared_vgpr_reg_hi, v1), dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false);
1525
1526 /* HI: Copy shared vgpr (low lanes' data) to output vgpr */
1527 bld.vop1_dpp(aco_opcode::v_mov_b32, def_temp1, Operand(shared_vgpr_reg_lo, v1), dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
1528
1529 /* Permute the original input */
1530 bld.ds(aco_opcode::ds_bpermute_b32, def_temp2, index_x4, input_data);
1531 /* Permute the swapped input */
1532 bld.ds(aco_opcode::ds_bpermute_b32, def_temp1, index_x4, tmp1);
1533
1534 /* Restore saved EXEC */
1535 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(instr->definitions[1].physReg(), s2));
1536 /* Choose whether to use the original or swapped */
1537 bld.vop2(aco_opcode::v_cndmask_b32, instr->definitions[0], tmp1, tmp2, compare);
1538 } else {
1539 emit_reduction(&ctx, reduce->opcode, reduce->reduce_op, reduce->cluster_size,
1540 reduce->operands[1].physReg(), // tmp
1541 reduce->definitions[1].physReg(), // stmp
1542 reduce->operands[2].physReg(), // vtmp
1543 reduce->definitions[2].physReg(), // sitmp
1544 reduce->operands[0], reduce->definitions[0]);
1545 }
1546 } else {
1547 ctx.instructions.emplace_back(std::move(instr));
1548 }
1549
1550 }
1551 block->instructions.swap(ctx.instructions);
1552 }
1553 }
1554
1555 }