aco: improve 8/16-bit constants
[mesa.git] src/amd/compiler/aco_lower_to_hw_instr.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 *
26 */
27
28 #include <map>
29
30 #include "aco_ir.h"
31 #include "aco_builder.h"
32 #include "util/u_math.h"
33 #include "sid.h"
34 #include "vulkan/radv_shader.h"
35
36
37 namespace aco {
38
39 struct lower_context {
40 Program *program;
41 std::vector<aco_ptr<Instruction>> instructions;
42 };
43
44 /* used by handle_operands() indirectly through Builder::copy */
45 uint8_t int8_mul_table[512] = {
46 0, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9, 1, 10, 1, 11,
47 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19, 1, 20, 1, 21,
48 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29, 1, 30, 1, 31,
49 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39, 1, 40, 1, 41,
50 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49, 1, 50, 1, 51,
51 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59, 1, 60, 1, 61,
52 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23, 2, 35, 11, 53,
53 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243, 2, 40, 3, 27,
54 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23, 2, 45, 7, 13,
55 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33, 2, 50, 7, 51,
56 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27, 2, 55, 3, 37,
57 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17, 2, 60, 9, 241,
58 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43, 5, 26, 9, 43,
59 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243, 4, 35, 3, 47,
60 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45, 3, 50, 7, 241,
61 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53, 4, 40, 7, 23,
62 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13, 5, 34, 3, 57,
63 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245, 3, 60, 5, 241,
64 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63, 5, 38, 5, 243,
65 3, 64, 7, 247, 9, 50, 5, 39, 4, 241, 33, 37, 6, 33, 13, 35, 4, 242, 5, 245,
66 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19, 5, 42, 3, 241,
67 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43, 3, 244,
68 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247,
69 2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247,
70 9, 55, 1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247,
71 1, 248, 1, 249, 1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255
72 };
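/* A note on the table layout (illustrative, based on how the values pair up):
 * each 8-bit constant c in 0..255 is stored as the factor pair
 * (int8_mul_table[2*c], int8_mul_table[2*c+1]). Entries in the 240..255 range
 * double as the negative inline constants -16..-1, so both factors are always
 * representable as VALU inline constants, and the low byte of their product
 * equals c. For example, c=79 is stored as (53, 243): 53 * 243 = 0x324F, whose
 * low byte is 0x4F = 79. This presumably lets Builder::copy materialize an
 * 8-bit constant from two inline constants (e.g. via a 24-bit multiply)
 * instead of spending a 32-bit literal. */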
73
74
75 aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) {
76    /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
77     * 32-bit opcodes (VOP2), which allows us to remove the temporary VGPR and to use
78     * DPP with the arithmetic instructions. This requires sign-extending the operands.
79 */
80 switch (op) {
81 case iadd8:
82 case iadd16:
83 if (chip >= GFX10) {
84 return aco_opcode::v_add_u32;
85 } else if (chip >= GFX8) {
86 return aco_opcode::v_add_u16;
87 } else {
88 return aco_opcode::v_add_co_u32;
89 }
90 break;
91 case imul8:
92 case imul16:
93 if (chip >= GFX10) {
94 return aco_opcode::v_mul_lo_u16_e64;
95 } else if (chip >= GFX8) {
96 return aco_opcode::v_mul_lo_u16;
97 } else {
98 return aco_opcode::v_mul_u32_u24;
99 }
100 break;
101 case fadd16: return aco_opcode::v_add_f16;
102 case fmul16: return aco_opcode::v_mul_f16;
103 case imax8:
104 case imax16:
105 if (chip >= GFX10) {
106 return aco_opcode::v_max_i32;
107 } else if (chip >= GFX8) {
108 return aco_opcode::v_max_i16;
109 } else {
110 return aco_opcode::v_max_i32;
111 }
112 break;
113 case imin8:
114 case imin16:
115 if (chip >= GFX10) {
116 return aco_opcode::v_min_i32;
117 } else if (chip >= GFX8) {
118 return aco_opcode::v_min_i16;
119 } else {
120 return aco_opcode::v_min_i32;
121 }
122 break;
123 case umin8:
124 case umin16:
125 if (chip >= GFX10) {
126 return aco_opcode::v_min_u32;
127 } else if (chip >= GFX8) {
128 return aco_opcode::v_min_u16;
129 } else {
130 return aco_opcode::v_min_u32;
131 }
132 break;
133 case umax8:
134 case umax16:
135 if (chip >= GFX10) {
136 return aco_opcode::v_max_u32;
137 } else if (chip >= GFX8) {
138 return aco_opcode::v_max_u16;
139 } else {
140 return aco_opcode::v_max_u32;
141 }
142 break;
143 case fmin16: return aco_opcode::v_min_f16;
144 case fmax16: return aco_opcode::v_max_f16;
145 case iadd32: return chip >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
146 case imul32: return aco_opcode::v_mul_lo_u32;
147 case fadd32: return aco_opcode::v_add_f32;
148 case fmul32: return aco_opcode::v_mul_f32;
149 case imax32: return aco_opcode::v_max_i32;
150 case imin32: return aco_opcode::v_min_i32;
151 case umin32: return aco_opcode::v_min_u32;
152 case umax32: return aco_opcode::v_max_u32;
153 case fmin32: return aco_opcode::v_min_f32;
154 case fmax32: return aco_opcode::v_max_f32;
155 case iand8:
156 case iand16:
157 case iand32: return aco_opcode::v_and_b32;
158 case ixor8:
159 case ixor16:
160 case ixor32: return aco_opcode::v_xor_b32;
161 case ior8:
162 case ior16:
163 case ior32: return aco_opcode::v_or_b32;
164 case iadd64: return aco_opcode::num_opcodes;
165 case imul64: return aco_opcode::num_opcodes;
166 case fadd64: return aco_opcode::v_add_f64;
167 case fmul64: return aco_opcode::v_mul_f64;
168 case imin64: return aco_opcode::num_opcodes;
169 case imax64: return aco_opcode::num_opcodes;
170 case umin64: return aco_opcode::num_opcodes;
171 case umax64: return aco_opcode::num_opcodes;
172 case fmin64: return aco_opcode::v_min_f64;
173 case fmax64: return aco_opcode::v_max_f64;
174 case iand64: return aco_opcode::num_opcodes;
175 case ior64: return aco_opcode::num_opcodes;
176 case ixor64: return aco_opcode::num_opcodes;
177 default: return aco_opcode::num_opcodes;
178 }
179 }
180
181 bool is_vop3_reduce_opcode(aco_opcode opcode)
182 {
183 /* 64-bit reductions are VOP3. */
184 if (opcode == aco_opcode::num_opcodes)
185 return true;
186
187 return instr_info.format[(int)opcode] == Format::VOP3;
188 }
189
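/* Emits a 32-bit add through Builder::vadd32 and, if the selected opcode
 * produces a carry-out definition, fixes that definition to vcc so callers can
 * rely on its location. */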
190 void emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
191 {
192 Instruction *instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
193 if (instr->definitions.size() >= 2) {
194 assert(instr->definitions[1].regClass() == bld.lm);
195 instr->definitions[1].setFixed(vcc);
196 }
197 }
198
199 void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
200 PhysReg vtmp_reg, ReduceOp op,
201 unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl,
202 Operand *identity=NULL)
203 {
204 Builder bld(ctx->program, &ctx->instructions);
205 Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)};
206 Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg+1}, v1)};
207 Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg+1}, v1)};
208 Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)};
209 Operand src1_64 = Operand(src1_reg, v2);
210 Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg+1}, v1)};
211 Operand vtmp_op64 = Operand(vtmp_reg, v2);
212 if (op == iadd64) {
213 if (ctx->program->chip_class >= GFX10) {
214 if (identity)
215 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
216 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
217 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
218 bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
219 } else {
220 bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
221 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
222 }
223 bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm),
224 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
225 } else if (op == iand64) {
226 bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0],
227 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
228 bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1],
229 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
230 } else if (op == ior64) {
231 bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0],
232 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
233 bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1],
234 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
235 } else if (op == ixor64) {
236 bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0],
237 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
238 bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1],
239 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
240 } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
241 aco_opcode cmp = aco_opcode::num_opcodes;
242 switch (op) {
243 case umin64:
244 cmp = aco_opcode::v_cmp_gt_u64;
245 break;
246 case umax64:
247 cmp = aco_opcode::v_cmp_lt_u64;
248 break;
249 case imin64:
250 cmp = aco_opcode::v_cmp_gt_i64;
251 break;
252 case imax64:
253 cmp = aco_opcode::v_cmp_lt_i64;
254 break;
255 default:
256 break;
257 }
258
259 if (identity) {
260 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
261 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
262 }
263 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
264 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
265 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1],
266 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
267
268 bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
269 bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
270 bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
271 } else if (op == imul64) {
272 /* t4 = dpp(x_hi)
273 * t1 = umul_lo(t4, y_lo)
274 * t3 = dpp(x_lo)
275 * t0 = umul_lo(t3, y_hi)
276 * t2 = iadd(t0, t1)
277 * t5 = umul_hi(t3, y_lo)
278 * res_hi = iadd(t2, t5)
279 * res_lo = umul_lo(t3, y_lo)
280 * Requires that res_hi != src0[0] and res_hi != src1[0]
281 * and that vtmp[0] != res_hi.
282 */
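      /* Sketch of why this sequence works: writing x = x_lo + 2^32 * x_hi and
       * y = y_lo + 2^32 * y_hi, the low 64 bits of x * y are
       *   lo(x_lo*y_lo) + 2^32 * (hi(x_lo*y_lo) + lo(x_hi*y_lo) + lo(x_lo*y_hi)),
       * which is exactly what the t0..t5/res_lo/res_hi steps above compute,
       * with every use of x routed through the DPP-moved copy in vtmp[0]. */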
283 if (identity)
284 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
285 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1],
286 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
287 bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
288 if (identity)
289 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
290 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
291 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
292 bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
293 emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
294 if (identity)
295 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
296 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
297 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
298 bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
299 emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
300 if (identity)
301 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
302 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
303 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
304 bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
305 }
306 }
307
308 void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp, ReduceOp op)
309 {
310 Builder bld(ctx->program, &ctx->instructions);
311 Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)};
312 RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
313 Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg+1}, src0_rc)};
314 Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)};
315 Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
316 Operand src1_64 = Operand(src1_reg, v2);
317
318 if (src0_rc == s1 &&
319 (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
320 assert(vtmp.reg() != 0);
321 bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
322 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]);
323 src0_reg = vtmp;
324 src0[0] = Operand(vtmp, v1);
325 src0[1] = Operand(PhysReg{vtmp+1}, v1);
326 src0_64 = Operand(vtmp, v2);
327 } else if (src0_rc == s1 && op == iadd64) {
328 assert(vtmp.reg() != 0);
329 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]);
330 src0[1] = Operand(PhysReg{vtmp+1}, v1);
331 }
332
333 if (op == iadd64) {
334 if (ctx->program->chip_class >= GFX10) {
335 bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
336 } else {
337 bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
338 }
339 bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm));
340 } else if (op == iand64) {
341 bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
342 bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
343 } else if (op == ior64) {
344 bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
345 bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
346 } else if (op == ixor64) {
347 bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
348 bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
349 } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
350 aco_opcode cmp = aco_opcode::num_opcodes;
351 switch (op) {
352 case umin64:
353 cmp = aco_opcode::v_cmp_gt_u64;
354 break;
355 case umax64:
356 cmp = aco_opcode::v_cmp_lt_u64;
357 break;
358 case imin64:
359 cmp = aco_opcode::v_cmp_gt_i64;
360 break;
361 case imax64:
362 cmp = aco_opcode::v_cmp_lt_i64;
363 break;
364 default:
365 break;
366 }
367
368 bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
369 bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
370 bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
371 } else if (op == imul64) {
372 if (src1_reg == dst_reg) {
373 /* it's fine if src0==dst but not if src1==dst */
374 std::swap(src0_reg, src1_reg);
375 std::swap(src0[0], src1[0]);
376 std::swap(src0[1], src1[1]);
377 std::swap(src0_64, src1_64);
378 }
379 assert(!(src0_reg == src1_reg));
380 /* t1 = umul_lo(x_hi, y_lo)
381 * t0 = umul_lo(x_lo, y_hi)
382 * t2 = iadd(t0, t1)
383 * t5 = umul_hi(x_lo, y_lo)
384 * res_hi = iadd(t2, t5)
385 * res_lo = umul_lo(x_lo, y_lo)
386 * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
387 */
388 Definition tmp0_def(PhysReg{src0_reg+1}, v1);
389 Definition tmp1_def(PhysReg{src1_reg+1}, v1);
390 Operand tmp0_op = src0[1];
391 Operand tmp1_op = src1[1];
392 bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
393 bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
394 emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
395 bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
396 emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
397 bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
398 }
399 }
400
401 void emit_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
402 PhysReg vtmp, ReduceOp op, unsigned size,
403 unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl,
404 Operand *identity=NULL) /* for VOP3 with sparse writes */
405 {
406 Builder bld(ctx->program, &ctx->instructions);
407 RegClass rc = RegClass(RegType::vgpr, size);
408 Definition dst(dst_reg, rc);
409 Operand src0(src0_reg, rc);
410 Operand src1(src1_reg, rc);
411
412 aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);
413 bool vop3 = is_vop3_reduce_opcode(opcode);
414
415 if (!vop3) {
416 if (opcode == aco_opcode::v_add_co_u32)
417 bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
418 else
419 bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
420 return;
421 }
422
423 if (opcode == aco_opcode::num_opcodes) {
424       emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op,
425 dpp_ctrl, row_mask, bank_mask, bound_ctrl, identity);
426 return;
427 }
428
429 if (identity)
430 bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
431 if (identity && size >= 2)
432 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), identity[1]);
433
434 for (unsigned i = 0; i < size; i++)
435 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{src0_reg+i}, v1),
436 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
437
438 bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
439 }
440
441 void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
442 PhysReg vtmp, ReduceOp op, unsigned size)
443 {
444 Builder bld(ctx->program, &ctx->instructions);
445 RegClass rc = RegClass(RegType::vgpr, size);
446 Definition dst(dst_reg, rc);
447 Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
448 Operand src1(src1_reg, rc);
449
450 aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);
451 bool vop3 = is_vop3_reduce_opcode(opcode);
452
453 if (opcode == aco_opcode::num_opcodes) {
454 emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
455 return;
456 }
457
458 if (vop3) {
459 bld.vop3(opcode, dst, src0, src1);
460 } else if (opcode == aco_opcode::v_add_co_u32) {
461 bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
462 } else {
463 bld.vop2(opcode, dst, src0, src1);
464 }
465 }
466
467 void emit_dpp_mov(lower_context *ctx, PhysReg dst, PhysReg src0, unsigned size,
468 unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
469 {
470 Builder bld(ctx->program, &ctx->instructions);
471 for (unsigned i = 0; i < size; i++) {
472 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst+i}, v1), Operand(PhysReg{src0+i}, v1),
473 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
474 }
475 }
476
477 uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
478 {
479 switch (op) {
480 case iadd8:
481 case iadd16:
482 case iadd32:
483 case iadd64:
484 case fadd16:
485 case fadd32:
486 case fadd64:
487 case ior8:
488 case ior16:
489 case ior32:
490 case ior64:
491 case ixor8:
492 case ixor16:
493 case ixor32:
494 case ixor64:
495 case umax8:
496 case umax16:
497 case umax32:
498 case umax64:
499 return 0;
500 case imul8:
501 case imul16:
502 case imul32:
503 case imul64:
504 return idx ? 0 : 1;
505 case fmul16:
506 return 0x3c00u; /* 1.0 */
507 case fmul32:
508 return 0x3f800000u; /* 1.0 */
509 case fmul64:
510 return idx ? 0x3ff00000u : 0u; /* 1.0 */
511 case imin8:
512 return INT8_MAX;
513 case imin16:
514 return INT16_MAX;
515 case imin32:
516 return INT32_MAX;
517 case imin64:
518 return idx ? 0x7fffffffu : 0xffffffffu;
519 case imax8:
520 return INT8_MIN;
521 case imax16:
522 return INT16_MIN;
523 case imax32:
524 return INT32_MIN;
525 case imax64:
526 return idx ? 0x80000000u : 0;
527 case umin8:
528 case umin16:
529 case iand8:
530 case iand16:
531 return 0xffffffffu;
532 case umin32:
533 case umin64:
534 case iand32:
535 case iand64:
536 return 0xffffffffu;
537 case fmin16:
538 return 0x7c00u; /* infinity */
539 case fmin32:
540 return 0x7f800000u; /* infinity */
541 case fmin64:
542 return idx ? 0x7ff00000u : 0u; /* infinity */
543 case fmax16:
544 return 0xfc00u; /* negative infinity */
545 case fmax32:
546 return 0xff800000u; /* negative infinity */
547 case fmax64:
548 return idx ? 0xfff00000u : 0u; /* negative infinity */
549 default:
550 unreachable("Invalid reduction operation");
551 break;
552 }
553 return 0;
554 }
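/* For the 64-bit cases the identity is handed out one dword at a time: e.g. for
 * fmin64 the identity +infinity (0x7ff0000000000000) is returned as 0x00000000
 * for idx=0 and 0x7ff00000 for idx=1. */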
555
556 void emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
557 {
558 for (unsigned i = 0; i < size; i++) {
559 bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst+i}, v1),
560 Operand(PhysReg{src+i}, v1), ds_pattern);
561 }
562 }
563
564 void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, PhysReg tmp,
565 PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
566 {
567 assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
568 assert(cluster_size <= ctx->program->wave_size);
569
570 Builder bld(ctx->program, &ctx->instructions);
571
572 Operand identity[2];
573 identity[0] = Operand(get_reduction_identity(reduce_op, 0));
574 identity[1] = Operand(get_reduction_identity(reduce_op, 1));
575 Operand vcndmask_identity[2] = {identity[0], identity[1]};
576
577 /* First, copy the source to tmp and set inactive lanes to the identity */
578 bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1), Definition(exec, bld.lm), Operand(UINT64_MAX), Operand(exec, bld.lm));
579
580 for (unsigned i = 0; i < src.size(); i++) {
581       /* p_exclusive_scan needs it to be an SGPR or inline constant for the v_writelane_b32,
582        * except on GFX10, where v_writelane_b32 can take a literal. */
583 if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && ctx->program->chip_class < GFX10) {
584 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]);
585 identity[i] = Operand(PhysReg{sitmp+i}, s1);
586
587 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]);
588 vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1);
589 } else if (identity[i].isLiteral()) {
590 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]);
591 vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1);
592 }
593 }
594
595 for (unsigned i = 0; i < src.size(); i++) {
596 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
597 vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
598 Operand(stmp, bld.lm));
599 }
600
601 if (src.regClass() == v1b) {
602 if (ctx->program->chip_class >= GFX8) {
603 aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
604 sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
605 sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
606 if (reduce_op == imin8 || reduce_op == imax8)
607 sdwa->sel[0] = sdwa_sbyte;
608 else
609 sdwa->sel[0] = sdwa_ubyte;
610 sdwa->dst_sel = sdwa_udword;
611 bld.insert(std::move(sdwa));
612 } else {
613 aco_opcode opcode;
614
615 if (reduce_op == imin8 || reduce_op == imax8)
616 opcode = aco_opcode::v_bfe_i32;
617 else
618 opcode = aco_opcode::v_bfe_u32;
619
620 bld.vop3(opcode, Definition(PhysReg{tmp}, v1),
621 Operand(PhysReg{tmp}, v1), Operand(0u), Operand(8u));
622 }
623 } else if (src.regClass() == v2b) {
624 if (ctx->program->chip_class >= GFX10 &&
625 (reduce_op == iadd16 || reduce_op == imax16 ||
626 reduce_op == imin16 || reduce_op == umin16 || reduce_op == umax16)) {
627 aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
628 sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
629 sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
630 if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
631 sdwa->sel[0] = sdwa_sword;
632 else
633 sdwa->sel[0] = sdwa_uword;
634 sdwa->dst_sel = sdwa_udword;
635 bld.insert(std::move(sdwa));
636 } else if (ctx->program->chip_class == GFX6 || ctx->program->chip_class == GFX7) {
637 aco_opcode opcode;
638
639 if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
640 opcode = aco_opcode::v_bfe_i32;
641 else
642 opcode = aco_opcode::v_bfe_u32;
643
644 bld.vop3(opcode, Definition(PhysReg{tmp}, v1),
645 Operand(PhysReg{tmp}, v1), Operand(0u), Operand(16u));
646 }
647 }
648
649 bool reduction_needs_last_op = false;
650 switch (op) {
651 case aco_opcode::p_reduce:
652 if (cluster_size == 1) break;
653
654 if (ctx->program->chip_class <= GFX7) {
655 reduction_needs_last_op = true;
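         /* Reading aid for the swizzles below (assuming the usual ds_swizzle
          * semantics): bit 15 of the pattern selects quad-perm mode, while
          * ds_pattern_bitmode(and, or, xor) remaps lane -> ((lane & and) | or) ^ xor
          * within a group of 32 lanes, so the xor masks 0x04/0x08/0x10 exchange
          * data across the halves of clusters of 8/16/32 lanes. */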
656 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
657 if (cluster_size == 2) break;
658 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
659 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
660 if (cluster_size == 4) break;
661 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
662 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
663 if (cluster_size == 8) break;
664 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
665 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
666 if (cluster_size == 16) break;
667 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
668 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
669 if (cluster_size == 32) break;
670 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
671 for (unsigned i = 0; i < src.size(); i++)
672 bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1), Operand(0u));
673          // TODO: it would be more efficient to do the last reduction step on SALU
674 emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
675 reduction_needs_last_op = false;
676 break;
677 }
678
679 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf, 0xf, false);
680 if (cluster_size == 2) break;
681 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf, 0xf, false);
682 if (cluster_size == 4) break;
683 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf, false);
684 if (cluster_size == 8) break;
685 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
686 if (cluster_size == 16) break;
687
688 if (ctx->program->chip_class >= GFX10) {
689 /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
690 for (unsigned i = 0; i < src.size(); i++)
691 bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));
692
693 if (cluster_size == 32) {
694 reduction_needs_last_op = true;
695 break;
696 }
697
698 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
699 for (unsigned i = 0; i < src.size(); i++)
700 bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u));
701             // TODO: it would be more efficient to do the last reduction step on SALU
702 emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
703 break;
704 }
705
706 if (cluster_size == 32) {
707 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
708 reduction_needs_last_op = true;
709 break;
710 }
711 assert(cluster_size == 64);
712 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf, false);
713 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf, false);
714 break;
715 case aco_opcode::p_exclusive_scan:
716 if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
717 /* shift rows right */
718 emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);
719
720 /* fill in the gaps in rows 1 and 3 */
721 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10000u));
722 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0x10000u));
723 for (unsigned i = 0; i < src.size(); i++) {
724 Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
725 Definition(PhysReg{vtmp+i}, v1),
726 Operand(PhysReg{tmp+i}, v1),
727 Operand(0xffffffffu), Operand(0xffffffffu)).instr;
728 static_cast<VOP3A_instruction*>(perm)->opsel = 1; /* FI (Fetch Inactive) */
729 }
730 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(UINT64_MAX));
731
732 if (ctx->program->wave_size == 64) {
733 /* fill in the gap in row 2 */
734 for (unsigned i = 0; i < src.size(); i++) {
735 bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
736 bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1));
737 }
738 }
739 std::swap(tmp, vtmp);
740 } else if (ctx->program->chip_class >= GFX8) {
741 emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
742 } else {
743 // TODO: use LDS on CS with a single write and shifted read
744 /* wavefront shift_right by 1 on SI/CI */
745 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
746 emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
747 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10101010u));
748 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
749 for (unsigned i = 0; i < src.size(); i++)
750 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1));
751
752 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
753 emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
754 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x01000100u));
755 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
756 for (unsigned i = 0; i < src.size(); i++)
757 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1));
758
759 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
760 emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
761 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(1u), Operand(16u));
762 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(1u), Operand(16u));
763 for (unsigned i = 0; i < src.size(); i++)
764 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1));
765
766 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
767 for (unsigned i = 0; i < src.size(); i++) {
768 bld.writelane(Definition(PhysReg{vtmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{vtmp+i}, v1));
769 bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u));
770 bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1));
771 identity[i] = Operand(0u); /* prevent further uses of identity */
772 }
773 std::swap(tmp, vtmp);
774 }
775
776 for (unsigned i = 0; i < src.size(); i++) {
777          if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
778 if (ctx->program->chip_class < GFX10)
779 assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
780 bld.writelane(Definition(PhysReg{tmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{tmp+i}, v1));
781 }
782 }
783 /* fall through */
784 case aco_opcode::p_inclusive_scan:
785 assert(cluster_size == ctx->program->wave_size);
786 if (ctx->program->chip_class <= GFX7) {
787 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
788 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xAAAAAAAAu));
789 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
790 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
791
792 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
793 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
794 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xCCCCCCCCu));
795 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
796 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
797
798 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
799 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
800 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xF0F0F0F0u));
801 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
802 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
803
804 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
805 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
806 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xFF00FF00u));
807 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
808 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
809
810 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
811 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
812 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(16u), Operand(16u));
813 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(16u), Operand(16u));
814 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
815
816 for (unsigned i = 0; i < src.size(); i++)
817 bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
818 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u));
819 emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
820 break;
821 }
822
823 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
824 dpp_row_sr(1), 0xf, 0xf, false, identity);
825 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
826 dpp_row_sr(2), 0xf, 0xf, false, identity);
827 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
828 dpp_row_sr(4), 0xf, 0xf, false, identity);
829 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
830 dpp_row_sr(8), 0xf, 0xf, false, identity);
831 if (ctx->program->chip_class >= GFX10) {
832 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(16u), Operand(16u));
833 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(16u), Operand(16u));
834 for (unsigned i = 0; i < src.size(); i++) {
835 Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
836 Definition(PhysReg{vtmp+i}, v1),
837 Operand(PhysReg{tmp+i}, v1),
838 Operand(0xffffffffu), Operand(0xffffffffu)).instr;
839 static_cast<VOP3A_instruction*>(perm)->opsel = 1; /* FI (Fetch Inactive) */
840 }
841 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
842
843 if (ctx->program->wave_size == 64) {
844 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u));
845 for (unsigned i = 0; i < src.size(); i++)
846 bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
847 emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
848 }
849 } else {
850 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
851 dpp_row_bcast15, 0xa, 0xf, false, identity);
852 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
853 dpp_row_bcast31, 0xc, 0xf, false, identity);
854 }
855 break;
856 default:
857 unreachable("Invalid reduction mode");
858 }
859
860
861 if (op == aco_opcode::p_reduce) {
862 if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
863 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
864 emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
865 return;
866 }
867
868 if (reduction_needs_last_op)
869 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
870 }
871
872 /* restore exec */
873 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
874
875 if (dst.regClass().type() == RegType::sgpr) {
876 for (unsigned k = 0; k < src.size(); k++) {
877 bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1),
878 Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
879 }
880 } else if (dst.physReg() != tmp) {
881 for (unsigned k = 0; k < src.size(); k++) {
882 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
883 Operand(PhysReg{tmp + k}, v1));
884 }
885 }
886 }
887
888 void emit_gfx10_wave64_bpermute(Program *program, aco_ptr<Instruction> &instr, Builder &bld)
889 {
890 /* Emulates proper bpermute on GFX10 in wave64 mode.
891 *
892 * This is necessary because on GFX10 the bpermute instruction only works
893 * on half waves (you can think of it as having a cluster size of 32), so we
894 * manually swap the data between the two halves using two shared VGPRs.
895 */
896
897 assert(program->chip_class >= GFX10);
898 assert(program->info->wave_size == 64);
899
900 unsigned shared_vgpr_reg_0 = align(program->config->num_vgprs, 4) + 256;
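   /* PhysReg 256 is v0, so this addresses the first shared VGPR just above the
    * shader's regular VGPR allocation; on GFX10 wave64 those shared VGPRs are
    * accessible from both halves of the wave, which is what makes the data
    * exchange below possible. */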
901 Definition dst = instr->definitions[0];
902 Definition tmp_exec = instr->definitions[1];
903 Definition clobber_scc = instr->definitions[2];
904 Operand index_x4 = instr->operands[0];
905 Operand input_data = instr->operands[1];
906 Operand same_half = instr->operands[2];
907
908 assert(dst.regClass() == v1);
909 assert(tmp_exec.regClass() == bld.lm);
910 assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
911 assert(same_half.regClass() == bld.lm);
912 assert(index_x4.regClass() == v1);
913 assert(input_data.regClass().type() == RegType::vgpr);
914 assert(input_data.bytes() <= 4);
915 assert(dst.physReg() != index_x4.physReg());
916 assert(dst.physReg() != input_data.physReg());
917 assert(tmp_exec.physReg() != same_half.physReg());
918
919 PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
920 PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);
921
922 /* Permute the input within the same half-wave */
923 bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
924
925 /* HI: Copy data from high lanes 32-63 to shared vgpr */
926 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data, dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
927 /* Save EXEC */
928 bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
929 /* Set EXEC to enable LO lanes only */
930 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(0u));
931 /* LO: Copy data from low lanes 0-31 to shared vgpr */
932 bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
933 /* LO: bpermute shared vgpr (high lanes' data) */
934 bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4, Operand(shared_vgpr_hi, v1));
935 /* Set EXEC to enable HI lanes only */
936 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u));
937 /* HI: bpermute shared vgpr (low lanes' data) */
938 bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4, Operand(shared_vgpr_lo, v1));
939
940 /* Only enable lanes which use the other half's data */
941 bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc, Operand(tmp_exec.physReg(), s2), same_half);
942 /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
943 bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false);
944 /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
945 bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
946
947 /* Restore saved EXEC */
948 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
949
950    /* RA assumes that the result is always in the low part of the register, so we have to shift if it's not there already */
951 if (input_data.physReg().byte()) {
952 unsigned right_shift = input_data.physReg().byte() * 8;
953 bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand(right_shift), Operand(dst.physReg(), v1));
954 }
955 }
956
957 void emit_gfx6_bpermute(Program *program, aco_ptr<Instruction> &instr, Builder &bld)
958 {
959 /* Emulates bpermute using readlane instructions */
960
961 Operand index = instr->operands[0];
962 Operand input = instr->operands[1];
963 Definition dst = instr->definitions[0];
964 Definition temp_exec = instr->definitions[1];
965 Definition clobber_vcc = instr->definitions[2];
966
967 assert(dst.regClass() == v1);
968 assert(temp_exec.regClass() == bld.lm);
969 assert(clobber_vcc.regClass() == bld.lm);
970 assert(clobber_vcc.physReg() == vcc);
971 assert(index.regClass() == v1);
972 assert(index.physReg() != dst.physReg());
973 assert(input.regClass().type() == RegType::vgpr);
974 assert(input.bytes() <= 4);
975 assert(input.physReg() != dst.physReg());
976
977 /* Save original EXEC */
978 bld.sop1(aco_opcode::s_mov_b64, temp_exec, Operand(exec, s2));
979
980    /* An "unrolled loop" that is executed once per lane.
981 * This takes only a few instructions per lane, as opposed to a "real" loop
982 * with branching, where the branch instruction alone would take 16+ cycles.
983 */
984 for (unsigned n = 0; n < program->wave_size; ++n) {
985 /* Activate the lane which has N for its source index */
986 bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), clobber_vcc, Operand(n), index);
987 /* Read the data from lane N */
988 bld.readlane(Definition(vcc, s1), input, Operand(n));
989 /* On the active lane, move the data we read from lane N to the destination VGPR */
990 bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));
991 /* Restore original EXEC */
992 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(temp_exec.physReg(), s2));
993 }
994 }
995
996 struct copy_operation {
997 Operand op;
998 Definition def;
999 unsigned bytes;
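   /* uses[i] counts how many still-pending copies read byte i of this copy's
    * definition; is_used aliases all eight counters so "any byte still needed"
    * can be checked with a single compare. */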
1000 union {
1001 uint8_t uses[8];
1002 uint64_t is_used = 0;
1003 };
1004 };
1005
1006 void split_copy(unsigned offset, Definition *def, Operand *op, const copy_operation& src, bool ignore_uses, unsigned max_size)
1007 {
1008 PhysReg def_reg = src.def.physReg();
1009 PhysReg op_reg = src.op.physReg();
1010 def_reg.reg_b += offset;
1011 op_reg.reg_b += offset;
1012
1013 max_size = MIN2(max_size, src.def.regClass().type() == RegType::vgpr ? 4 : 8);
1014
1015 /* make sure the size is a power of two and reg % bytes == 0 */
1016 unsigned bytes = 1;
1017 for (; bytes <= max_size; bytes *= 2) {
1018 unsigned next = bytes * 2u;
1019 bool can_increase = def_reg.reg_b % next == 0 &&
1020 offset + next <= src.bytes && next <= max_size;
1021 if (!src.op.isConstant() && can_increase)
1022 can_increase = op_reg.reg_b % next == 0;
1023 for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
1024 can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
1025 if (!can_increase)
1026 break;
1027 }
1028
1029 RegClass def_cls = bytes % 4 == 0 ? RegClass(src.def.regClass().type(), bytes / 4u) :
1030 RegClass(src.def.regClass().type(), bytes).as_subdword();
1031 *def = Definition(src.def.tempId(), def_reg, def_cls);
1032 if (src.op.isConstant()) {
1033 assert(bytes >= 1 && bytes <= 8);
1034 if (bytes == 8)
1035 *op = Operand(src.op.constantValue64() >> (offset * 8u));
1036 else if (bytes == 4)
1037 *op = Operand(uint32_t(src.op.constantValue64() >> (offset * 8u)));
1038 else if (bytes == 2)
1039 *op = Operand(uint16_t(src.op.constantValue64() >> (offset * 8u)));
1040 else if (bytes == 1)
1041 *op = Operand(uint8_t(src.op.constantValue64() >> (offset * 8u)));
1042 } else {
1043 RegClass op_cls = bytes % 4 == 0 ? RegClass(src.op.regClass().type(), bytes / 4u) :
1044 RegClass(src.op.regClass().type(), bytes).as_subdword();
1045 *op = Operand(op_reg, op_cls);
1046 op->setTemp(Temp(src.op.tempId(), op_cls));
1047 }
1048 }
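/* Example of the splitting above: a copy of 6 bytes whose source and destination
 * are both dword-aligned comes back as a 4-byte piece followed by a 2-byte piece,
 * since the loop only grows a piece while its size stays a power of two, both
 * registers remain aligned to it, and (unless ignore_uses is set) it does not mix
 * used and unused bytes. */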
1049
1050 uint32_t get_intersection_mask(int a_start, int a_size,
1051 int b_start, int b_size)
1052 {
1053 int intersection_start = MAX2(b_start - a_start, 0);
1054 int intersection_end = MAX2(b_start + b_size - a_start, 0);
1055 if (intersection_start >= a_size || intersection_end == 0)
1056 return 0;
1057
1058 uint32_t mask = u_bit_consecutive(0, a_size);
1059 return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
1060 }
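/* Example: if a covers bytes 4..7 (a_start=4, a_size=4) and b covers bytes 6..9
 * (b_start=6, b_size=4), the overlap is bytes 6..7, i.e. bits 2..3 relative to
 * a_start, and the function returns 0b1100. */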
1061
1062 bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc, PhysReg scratch_sgpr)
1063 {
1064 bool did_copy = false;
1065 for (unsigned offset = 0; offset < copy.bytes;) {
1066 if (copy.uses[offset]) {
1067 offset++;
1068 continue;
1069 }
1070
1071 Definition def;
1072 Operand op;
1073 split_copy(offset, &def, &op, copy, false, 8);
1074
1075 if (def.physReg() == scc) {
1076 bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand(0u));
1077 *preserve_scc = true;
1078 } else if (def.bytes() == 8 && def.getTemp().type() == RegType::sgpr) {
1079 bld.sop1(aco_opcode::s_mov_b64, def, Operand(op.physReg(), s2));
1080 } else if (def.regClass().is_subdword() && ctx->program->chip_class < GFX8) {
1081 if (op.physReg().byte()) {
1082 assert(def.physReg().byte() == 0);
1083 bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand(op.physReg().byte() * 8), op);
1084 } else if (def.physReg().byte() == 2) {
1085 assert(op.physReg().byte() == 0);
1086 /* preserve the target's lower half */
1087 def = Definition(def.physReg().advance(-2), v1);
1088 bld.vop2(aco_opcode::v_and_b32, Definition(op.physReg(), v1), Operand(0xFFFFu), op);
1089 if (def.physReg().reg() != op.physReg().reg())
1090 bld.vop2(aco_opcode::v_and_b32, def, Operand(0xFFFFu), Operand(def.physReg(), v2b));
1091 bld.vop2(aco_opcode::v_cvt_pk_u16_u32, def, Operand(def.physReg(), v2b), op);
1092 } else if (def.physReg().byte()) {
1093 unsigned bits = def.physReg().byte() * 8;
1094 assert(op.physReg().byte() == 0);
1095 def = Definition(def.physReg().advance(-def.physReg().byte()), v1);
1096 bld.vop2(aco_opcode::v_and_b32, def, Operand((1 << bits) - 1u), Operand(def.physReg(), op.regClass()));
1097 if (def.physReg().reg() == op.physReg().reg()) {
1098 if (bits < 24) {
1099 bld.vop2(aco_opcode::v_mul_u32_u24, def, Operand((1 << bits) + 1u), op);
1100 } else {
1101 bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand((1 << bits) + 1u));
1102 bld.vop3(aco_opcode::v_mul_lo_u32, def, Operand(scratch_sgpr, s1), op);
1103 }
1104 } else {
1105 bld.vop2(aco_opcode::v_lshlrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
1106 bld.vop2(aco_opcode::v_or_b32, def, Operand(def.physReg(), op.regClass()), op);
1107 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
1108 }
1109 } else {
1110 bld.vop1(aco_opcode::v_mov_b32, def, op);
1111 }
1112 } else {
1113 bld.copy(def, op);
1114 }
1115
1116 did_copy = true;
1117 offset += def.bytes();
1118 }
1119 return did_copy;
1120 }
1121
1122 void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool preserve_scc, Pseudo_instruction *pi)
1123 {
1124 unsigned offset = 0;
1125
1126 if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
1127 (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
1128 /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte swap */
1129 PhysReg op = copy.op.physReg();
1130 PhysReg def = copy.def.physReg();
1131 op.reg_b &= ~0x3;
1132 def.reg_b &= ~0x3;
1133
1134 copy_operation tmp;
1135 tmp.op = Operand(op, v1);
1136 tmp.def = Definition(def, v1);
1137 tmp.bytes = 4;
1138 memset(tmp.uses, 1, 4);
1139 do_swap(ctx, bld, tmp, preserve_scc, pi);
1140
1141 op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1142 def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1143 tmp.op = Operand(op, v1b);
1144 tmp.def = Definition(def, v1b);
1145 tmp.bytes = 1;
1146 tmp.uses[0] = 1;
1147 do_swap(ctx, bld, tmp, preserve_scc, pi);
1148
1149 offset = copy.bytes;
1150 }
1151
1152 for (; offset < copy.bytes;) {
1153 Definition def;
1154 Operand op;
1155 split_copy(offset, &def, &op, copy, true, 8);
1156
1157 assert(op.regClass() == def.regClass());
1158 Operand def_as_op = Operand(def.physReg(), def.regClass());
1159 Definition op_as_def = Definition(op.physReg(), op.regClass());
1160 if (ctx->program->chip_class >= GFX9 && def.regClass() == v1) {
1161 bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
1162 } else if (def.regClass() == v1 || (def.regClass().is_subdword() && ctx->program->chip_class < GFX8)) {
1163 assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
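         /* no v_swap_b32 is available here, so exchange the two registers with
          * the classic three-XOR swap, which needs no temporary */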
1164 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1165 bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
1166 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1167 } else if (op.physReg() == scc || def.physReg() == scc) {
1168 /* we need to swap scc and another sgpr */
1169 assert(!preserve_scc);
1170
1171 PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();
1172
1173 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1174 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1), Operand(0u));
1175 bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
1176 } else if (def.regClass() == s1) {
1177 if (preserve_scc) {
1178 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
1179 bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
1180 bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
1181 } else {
1182 bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1183 bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
1184 bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1185 }
1186 } else if (def.regClass() == s2) {
1187 if (preserve_scc)
1188 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1189 bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1190 bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
1191 bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1192 if (preserve_scc)
1193 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1), Operand(0u));
1194 } else if (ctx->program->chip_class >= GFX9 && def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
1195 aco_ptr<VOP3P_instruction> vop3p{create_instruction<VOP3P_instruction>(aco_opcode::v_pk_add_u16, Format::VOP3P, 2, 1)};
1196 vop3p->operands[0] = Operand(PhysReg{op.physReg().reg()}, v1);
1197 vop3p->operands[1] = Operand(0u);
1198 vop3p->definitions[0] = Definition(PhysReg{op.physReg().reg()}, v1);
1199 vop3p->opsel_lo = 0x1;
1200 vop3p->opsel_hi = 0x2;
1201 bld.insert(std::move(vop3p));
1202 } else {
1203 assert(def.regClass().is_subdword());
1204 bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1205 bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
1206 bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1207 }
1208
1209 offset += def.bytes();
1210 }
1211
1212 if (ctx->program->chip_class <= GFX7)
1213 return;
1214
1215 /* fixup in case we swapped bytes we shouldn't have */
1216 copy_operation tmp_copy = copy;
1217 tmp_copy.op.setFixed(copy.def.physReg());
1218 tmp_copy.def.setFixed(copy.op.physReg());
1219 do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
1220 }
1221
1222 void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, Operand hi)
1223 {
1224 if (ctx->program->chip_class >= GFX9) {
1225 Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
1226 /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
1227 static_cast<VOP3A_instruction*>(instr)->opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
1228 } else if (ctx->program->chip_class >= GFX8) {
1229 // TODO: optimize with v_mov_b32 / v_lshlrev_b32
1230 PhysReg reg = def.physReg();
1231 bld.copy(Definition(reg, v2b), lo);
1232 reg.reg_b += 2;
1233 bld.copy(Definition(reg, v2b), hi);
1234 } else {
1235 assert(lo.physReg().byte() == 0 && hi.physReg().byte() == 0);
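      /* v_cvt_pk_u16_u32 saturates its inputs, so mask both values to 16 bits
       * first; the pack below then simply concatenates them */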
1236 bld.vop2(aco_opcode::v_and_b32, Definition(lo.physReg(), v1), Operand(0xFFFFu), lo);
1237 bld.vop2(aco_opcode::v_and_b32, Definition(hi.physReg(), v1), Operand(0xFFFFu), hi);
1238 bld.vop2(aco_opcode::v_cvt_pk_u16_u32, def, lo, hi);
1239 }
1240 }
1241
1242 void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi)
1243 {
1244 Builder bld(ctx->program, &ctx->instructions);
1245 unsigned num_instructions_before = ctx->instructions.size();
1246 aco_ptr<Instruction> mov;
1247 std::map<PhysReg, copy_operation>::iterator it = copy_map.begin();
1248 std::map<PhysReg, copy_operation>::iterator target;
1249 bool writes_scc = false;
1250
1251 /* count the number of uses for each dst reg */
1252 while (it != copy_map.end()) {
1253
1254 if (it->second.def.physReg() == scc)
1255 writes_scc = true;
1256
1257 assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));
1258
1259 /* if src and dst reg are the same, remove operation */
1260 if (it->first == it->second.op.physReg()) {
1261 it = copy_map.erase(it);
1262 continue;
1263 }
1264
1265 /* split large copies */
1266 if (it->second.bytes > 8) {
1267 assert(!it->second.op.isConstant());
1268 assert(!it->second.def.regClass().is_subdword());
1269 RegClass rc = RegClass(it->second.def.regClass().type(), it->second.def.size() - 2);
1270 Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
1271 rc = RegClass(it->second.op.regClass().type(), it->second.op.size() - 2);
1272 Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
1273 copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
1274 copy_map[hi_def.physReg()] = copy;
1275 assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
1276 it->second.op = Operand(it->second.op.physReg(), it->second.op.regClass().type() == RegType::sgpr ? s2 : v2);
1277 it->second.def = Definition(it->second.def.physReg(), it->second.def.regClass().type() == RegType::sgpr ? s2 : v2);
1278 it->second.bytes = 8;
1279 }
1280
1281 /* try to coalesce copies */
1282 if (it->second.bytes < 8 && !it->second.op.isConstant() &&
1283 it->first.reg_b % util_next_power_of_two(it->second.bytes + 1) == 0 &&
1284 it->second.op.physReg().reg_b % util_next_power_of_two(it->second.bytes + 1) == 0) {
1285 // TODO try more relaxed alignment for subdword copies
1286 PhysReg other_def_reg = it->first;
1287 other_def_reg.reg_b += it->second.bytes;
1288 PhysReg other_op_reg = it->second.op.physReg();
1289 other_op_reg.reg_b += it->second.bytes;
1290 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(other_def_reg);
1291 if (other != copy_map.end() &&
1292 other->second.op.physReg() == other_op_reg &&
1293 it->second.bytes + other->second.bytes <= 8) {
1294 it->second.bytes += other->second.bytes;
1295 it->second.def = Definition(it->first, RegClass::get(it->second.def.regClass().type(), it->second.bytes));
1296 it->second.op = Operand(it->second.op.physReg(), RegClass::get(it->second.op.regClass().type(), it->second.bytes));
1297 copy_map.erase(other);
1298 }
1299 }
1300
1301 /* check if the definition reg is used by another copy operation */
1302 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1303 if (copy.second.op.isConstant())
1304 continue;
1305 for (uint16_t i = 0; i < it->second.bytes; i++) {
1306 /* distance might underflow */
1307 unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
1308 if (distance < copy.second.bytes)
1309 it->second.uses[i] += 1;
1310 }
1311 }
1312
1313 ++it;
1314 }
1315
1316 /* first, handle paths in the location transfer graph */
1317 bool preserve_scc = pi->tmp_in_scc && !writes_scc;
1318 bool skip_partial_copies = true;
1319 it = copy_map.begin();
1320 while (true) {
1321 if (copy_map.empty()) {
1322 ctx->program->statistics[statistic_copies] += ctx->instructions.size() - num_instructions_before;
1323 return;
1324 }
1325 if (it == copy_map.end()) {
1326 if (!skip_partial_copies)
1327 break;
1328 skip_partial_copies = false;
1329 it = copy_map.begin();
1330 }
1331
1332 /* check if we can pack one register at once */
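/* e.g. two 16-bit copies targeting v0[15:0] and v0[31:16] can be combined into a
 * single full write of v0 via do_pack_2x16(), avoiding partial register writes */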
1333 if (it->first.byte() == 0 && it->second.bytes == 2) {
1334 PhysReg reg_hi = it->first.advance(2);
1335 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1336 if (other != copy_map.end() && other->second.bytes == 2) {
1337 /* check if the target register is otherwise unused */
1338 // TODO: also do this for self-intersecting registers
1339 bool unused_lo = !it->second.is_used;
1340 bool unused_hi = !other->second.is_used;
1341 if (unused_lo && unused_hi) {
1342 Operand lo = it->second.op;
1343 Operand hi = other->second.op;
1344 do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);
1345 copy_map.erase(it);
1346 copy_map.erase(other);
1347
1348 for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
1349 for (uint16_t i = 0; i < other.second.bytes; i++) {
1350 /* distance might underflow */
1351 unsigned distance_lo = other.first.reg_b + i - lo.physReg().reg_b;
1352 unsigned distance_hi = other.first.reg_b + i - hi.physReg().reg_b;
1353 if (distance_lo < 2 || distance_hi < 2)
1354 other.second.uses[i] -= 1;
1355 }
1356 }
1357 it = copy_map.begin();
1358 continue;
1359 }
1360 }
1361 }
1362
1363 /* on GFX6/7, we need some small workarounds as there is no
1364 * SDWA instruction to do partial register writes */
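/* Two cases are handled: for p_split_vector, later readers of the source's high
 * bits are redirected to the destination dword, which (thanks to the dead
 * definitions that are also emitted) will contain them after the copy; for
 * p_create_vector, a copy targeting an upper register half instead writes into
 * the high bits of the lower half's source operand, and that lower copy is
 * widened to move the whole dword at once. */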
1365 if (ctx->program->chip_class < GFX8 && it->second.bytes < 4) {
1366 if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 &&
1367 !it->second.is_used && pi->opcode == aco_opcode::p_split_vector) {
1368 /* Other operations might overwrite the high bits, so change all users
1369 * of the high bits to the new target where they are still available.
1370 * This mechanism depends on also emitting dead definitions. */
1371 PhysReg reg_hi = it->second.op.physReg().advance(it->second.bytes);
1372 while (reg_hi != PhysReg(it->second.op.physReg().reg() + 1)) {
1373 std::map<PhysReg, copy_operation>::iterator other = copy_map.begin();
1374 for (other = copy_map.begin(); other != copy_map.end(); other++) {
1375 /* on GFX6/7, if the high bits are used as operand, they cannot be a target */
1376 if (other->second.op.physReg() == reg_hi) {
1377 other->second.op.setFixed(it->first.advance(reg_hi.byte()));
1378 break; /* break because an operand can only be used once */
1379 }
1380 }
1381 reg_hi = reg_hi.advance(it->second.bytes);
1382 }
1383 } else if (it->first.byte()) {
1384 assert(pi->opcode == aco_opcode::p_create_vector);
1385 /* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled,
1386 * move to the target operand's high bits. This is safe to do as it cannot be an operand */
1387 PhysReg lo = PhysReg(it->first.reg());
1388 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(lo);
1389 if (other != copy_map.end()) {
1390 assert(other->second.bytes == it->first.byte());
1391 PhysReg new_reg_hi = other->second.op.physReg().advance(it->first.byte());
1392 it->second.def = Definition(new_reg_hi, it->second.def.regClass());
1393 it->second.is_used = 0;
1394 other->second.bytes += it->second.bytes;
1395 other->second.def.setTemp(Temp(other->second.def.tempId(), RegClass::get(RegType::vgpr, other->second.bytes)));
1396 other->second.op.setTemp(Temp(other->second.op.tempId(), RegClass::get(RegType::vgpr, other->second.bytes)));
1397 /* if the new target's high bits are also a target, change uses */
1398 std::map<PhysReg, copy_operation>::iterator target = copy_map.find(new_reg_hi);
1399 if (target != copy_map.end()) {
1400 for (unsigned i = 0; i < it->second.bytes; i++)
1401 target->second.uses[i]++;
1402 }
1403 }
1404 }
1405 }
1406
1407 /* find portions where the target reg is not used as operand for any other copy */
1408 if (it->second.is_used) {
1409 if (it->second.op.isConstant() || skip_partial_copies) {
1410 /* we have to skip constants until is_used=0.
1411 * we also skip partial copies at the beginning to help coalescing */
1412 ++it;
1413 continue;
1414 }
1415
1416 unsigned has_zero_use_bytes = 0;
1417 for (unsigned i = 0; i < it->second.bytes; i++)
1418 has_zero_use_bytes |= (it->second.uses[i] == 0) << i;
1419
1420 if (has_zero_use_bytes) {
1421 /* Skipping the partial copy and doing a v_swap_b32 plus fixup copies
1422 * afterwards is usually beneficial for sub-dword copies, but if a
1423 * partial copy would enable further copies, do the partial copy instead. */
1424 bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
1425 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1426 if (partial_copy)
1427 break;
1428 for (uint16_t i = 0; i < copy.second.bytes; i++) {
1429 /* distance might underflow */
1430 unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
1431 if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
1432 !it->second.uses[distance])
1433 partial_copy = true;
1434 }
1435 }
1436
1437 if (!partial_copy) {
1438 ++it;
1439 continue;
1440 }
1441 } else {
1442 /* full target reg is used: register swapping needed */
1443 ++it;
1444 continue;
1445 }
1446 }
1447
1448 bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
1449 skip_partial_copies = did_copy;
1450 std::pair<PhysReg, copy_operation> copy = *it;
1451
1452 if (it->second.is_used == 0) {
1453 /* the target reg is not used as operand for any other copy, so we
1454 * copied to all of it */
1455 copy_map.erase(it);
1456 it = copy_map.begin();
1457 } else {
1458 /* we only performed some portions of this copy, so split it to only
1459 * leave the portions that still need to be done */
1460 copy_operation original = it->second; /* the map insertion below can overwrite this */
1461 copy_map.erase(it);
1462 for (unsigned offset = 0; offset < original.bytes;) {
1463 if (original.uses[offset] == 0) {
1464 offset++;
1465 continue;
1466 }
1467 Definition def;
1468 Operand op;
1469 split_copy(offset, &def, &op, original, false, 8);
1470
1471 copy_operation copy = {op, def, def.bytes()};
1472 for (unsigned i = 0; i < copy.bytes; i++)
1473 copy.uses[i] = original.uses[i + offset];
1474 copy_map[def.physReg()] = copy;
1475
1476 offset += def.bytes();
1477 }
1478
1479 it = copy_map.begin();
1480 }
1481
1482 /* Reduce the number of uses of the operand reg by one. Do this after
1483 * splitting the copy or removing it in case the copy writes to its own
1484 * operand (for example, v[7:8] = v[8:9]) */
1485 if (did_copy && !copy.second.op.isConstant()) {
1486 for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
1487 for (uint16_t i = 0; i < other.second.bytes; i++) {
1488 /* distance might underflow */
1489 unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
1490 if (distance < copy.second.bytes && !copy.second.uses[distance])
1491 other.second.uses[i] -= 1;
1492 }
1493 }
1494 }
1495 }
1496
1497 /* all target regs are needed as an operand somewhere, which means all remaining entries are part of a cycle */
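/* e.g. {v0 <- v1, v1 <- v0} is a two-element cycle resolved by a single swap.
 * Each swap below fixes one element in place, so an N-element cycle shrinks by
 * one per iteration (see do_swap(), which uses v_swap_b32 where available and
 * otherwise falls back to xor-swap sequences). */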
1498 unsigned largest = 0;
1499 for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
1500 largest = MAX2(largest, op.second.bytes);
1501
1502 while (!copy_map.empty()) {
1503
1504 /* Perform larger swaps first, because larger swaps can make other
1505 * swaps unnecessary. */
1506 auto it = copy_map.begin();
1507 for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
1508 if (it2->second.bytes > it->second.bytes) {
1509 it = it2;
1510 if (it->second.bytes == largest)
1511 break;
1512 }
1513 }
1514
1515 /* constant copies should already have been handled above */
1516 assert(!it->second.op.isConstant());
1517
1518 assert(it->second.op.isFixed());
1519 assert(it->second.def.regClass() == it->second.op.regClass());
1520
1521 if (it->first == it->second.op.physReg()) {
1522 copy_map.erase(it);
1523 continue;
1524 }
1525
1526 if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
1527 assert(!(it->second.def.physReg() == pi->scratch_sgpr));
1528
1529 /* to resolve the cycle, we have to swap the src reg with the dst reg */
1530 copy_operation swap = it->second;
1531
1532 /* if this is self-intersecting, we have to split it because
1533 * self-intersecting swaps don't make sense */
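/* e.g. a cycle can produce a "swap" between v[0:1] and v[1:2], whose source and
 * destination ranges overlap in v1. Only the non-overlapping leading bytes are
 * swapped here; the remainder is re-inserted into copy_map as smaller operations. */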
1534 PhysReg lower = swap.def.physReg();
1535 PhysReg higher = swap.op.physReg();
1536 if (lower.reg_b > higher.reg_b)
1537 std::swap(lower, higher);
1538 if (higher.reg_b - lower.reg_b < (int)swap.bytes) {
1539 unsigned offset = higher.reg_b - lower.reg_b;
1540 RegType type = swap.def.regClass().type();
1541
1542 copy_operation middle;
1543 lower.reg_b += offset;
1544 higher.reg_b += offset;
1545 middle.bytes = swap.bytes - offset * 2;
1546 memcpy(middle.uses, swap.uses + offset, middle.bytes);
1547 middle.op = Operand(lower, RegClass::get(type, middle.bytes));
1548 middle.def = Definition(higher, RegClass::get(type, middle.bytes));
1549 copy_map[higher] = middle;
1550
1551 copy_operation end;
1552 lower.reg_b += middle.bytes;
1553 higher.reg_b += middle.bytes;
1554 end.bytes = swap.bytes - (offset + middle.bytes);
1555 memcpy(end.uses, swap.uses + offset + middle.bytes, end.bytes);
1556 end.op = Operand(lower, RegClass::get(type, end.bytes));
1557 end.def = Definition(higher, RegClass::get(type, end.bytes));
1558 copy_map[higher] = end;
1559
1560 memset(swap.uses + offset, 0, swap.bytes - offset);
1561 swap.bytes = offset;
1562 }
1563
1564 do_swap(ctx, bld, swap, preserve_scc, pi);
1565
1566 /* remove from map */
1567 copy_map.erase(it);
1568
1569 /* redirect copies that read the swap's definition to read its operand instead, splitting them where needed */
1570 target = copy_map.begin();
1571 uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
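/* bytes_left tracks which bytes of the swap's definition still have a reader
 * that needs to be redirected; once it is empty we can stop scanning. */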
1572 for (; target != copy_map.end(); ++target) {
1573 if (target->second.op.physReg() == swap.def.physReg() && swap.bytes == target->second.bytes) {
1574 target->second.op.setFixed(swap.op.physReg());
1575 break;
1576 }
1577
1578 uint32_t imask = get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
1579 target->second.op.physReg().reg_b, target->second.bytes);
1580
1581 if (!imask)
1582 continue;
1583
1584 assert(target->second.bytes < swap.bytes);
1585
1586 int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;
1587
1588 /* split and update the middle (the portion that reads the swap's
1589 * definition) to read the swap's operand instead */
1590 int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
1591 int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
1592 int before_bytes = MAX2(-offset, 0);
1593 int after_bytes = MAX2(target_op_end - swap_def_end, 0);
1594 int middle_bytes = target->second.bytes - before_bytes - after_bytes;
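/* the intersecting copy is cut into up to three pieces: bytes read before the
 * swapped range, bytes inside it (redirected to the swap's operand) and bytes
 * after it. The middle and trailing pieces are re-inserted into copy_map, while
 * the leading piece simply shrinks the existing entry. */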
1595
1596 if (after_bytes) {
1597 unsigned after_offset = before_bytes + middle_bytes;
1598 assert(after_offset > 0);
1599 copy_operation copy;
1600 copy.bytes = after_bytes;
1601 memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
1602 RegClass rc = RegClass::get(target->second.op.regClass().type(), after_bytes);
1603 copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
1604 copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
1605 copy_map[copy.def.physReg()] = copy;
1606 }
1607
1608 if (middle_bytes) {
1609 copy_operation copy;
1610 copy.bytes = middle_bytes;
1611 memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
1612 RegClass rc = RegClass::get(target->second.op.regClass().type(), middle_bytes);
1613 copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
1614 copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
1615 copy_map[copy.def.physReg()] = copy;
1616 }
1617
1618 if (before_bytes) {
1620 target->second.bytes = before_bytes;
1621 RegClass rc = RegClass::get(target->second.op.regClass().type(), before_bytes);
1622 target->second.op = Operand(target->second.op.physReg(), rc);
1623 target->second.def = Definition(target->second.def.physReg(), rc);
1624 memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
1625 }
1626
1627 /* break early since we know each byte of the swap's definition is used
1628 * at most once */
1629 bytes_left &= ~imask;
1630 if (!bytes_left)
1631 break;
1632 }
1633 }
1634 ctx->program->statistics[statistic_copies] += ctx->instructions.size() - num_instructions_before;
1635 }
1636
1637 void lower_to_hw_instr(Program* program)
1638 {
1639 Block *discard_block = NULL;
1640
1641 for (size_t i = 0; i < program->blocks.size(); i++)
1642 {
1643 Block *block = &program->blocks[i];
1644 lower_context ctx;
1645 ctx.program = program;
1646 Builder bld(program, &ctx.instructions);
1647
1648 bool set_mode = i == 0 && block->fp_mode.val != program->config->float_mode;
1649 for (unsigned pred : block->linear_preds) {
1650 if (program->blocks[pred].fp_mode.val != block->fp_mode.val) {
1651 set_mode = true;
1652 break;
1653 }
1654 }
1655 if (set_mode) {
1656 /* only allow changing modes at top-level blocks so this doesn't break
1657 * the "jump over empty blocks" optimization */
1658 assert(block->kind & block_kind_top_level);
1659 uint32_t mode = block->fp_mode.val;
1660 /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
1661 bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand(mode), (7 << 11) | 1);
1662 }
1663
1664 for (size_t j = 0; j < block->instructions.size(); j++) {
1665 aco_ptr<Instruction>& instr = block->instructions[j];
1667 if (instr->format == Format::PSEUDO) {
1668 Pseudo_instruction *pi = (Pseudo_instruction*)instr.get();
1669
1670 switch (instr->opcode)
1671 {
1672 case aco_opcode::p_extract_vector:
1673 {
1674 PhysReg reg = instr->operands[0].physReg();
1675 Definition& def = instr->definitions[0];
1676 reg.reg_b += instr->operands[1].constantValue() * def.bytes();
1677
1678 if (reg == def.physReg())
1679 break;
1680
1681 RegClass op_rc = def.regClass().is_subdword() ? def.regClass() :
1682 RegClass(instr->operands[0].getTemp().type(), def.size());
1683 std::map<PhysReg, copy_operation> copy_operations;
1684 copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
1685 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1686 break;
1687 }
1688 case aco_opcode::p_create_vector:
1689 {
1690 std::map<PhysReg, copy_operation> copy_operations;
1691 PhysReg reg = instr->definitions[0].physReg();
1692
1693 for (const Operand& op : instr->operands) {
1694 if (op.isConstant()) {
1695 const Definition def = Definition(reg, RegClass(instr->definitions[0].getTemp().type(), op.size()));
1696 copy_operations[reg] = {op, def, op.bytes()};
1697 reg.reg_b += op.bytes();
1698 continue;
1699 }
1700 if (op.isUndefined()) {
1701 // TODO: coalesce subdword copies if dst byte is 0
1702 reg.reg_b += op.bytes();
1703 continue;
1704 }
1705
1706 RegClass rc_def = op.regClass().is_subdword() ? op.regClass() :
1707 RegClass(instr->definitions[0].getTemp().type(), op.size());
1708 const Definition def = Definition(reg, rc_def);
1709 copy_operations[def.physReg()] = {op, def, op.bytes()};
1710 reg.reg_b += op.bytes();
1711 }
1712 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1713 break;
1714 }
1715 case aco_opcode::p_split_vector:
1716 {
1717 std::map<PhysReg, copy_operation> copy_operations;
1718 PhysReg reg = instr->operands[0].physReg();
1719
1720 for (const Definition& def : instr->definitions) {
1721 RegClass rc_op = def.regClass().is_subdword() ? def.regClass() :
1722 RegClass(instr->operands[0].getTemp().type(), def.size());
1723 const Operand op = Operand(reg, rc_op);
1724 copy_operations[def.physReg()] = {op, def, def.bytes()};
1725 reg.reg_b += def.bytes();
1726 }
1727 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1728 break;
1729 }
1730 case aco_opcode::p_parallelcopy:
1731 case aco_opcode::p_wqm:
1732 {
1733 std::map<PhysReg, copy_operation> copy_operations;
1734 for (unsigned i = 0; i < instr->operands.size(); i++) {
1735 assert(instr->definitions[i].bytes() == instr->operands[i].bytes());
1736 copy_operations[instr->definitions[i].physReg()] = {instr->operands[i], instr->definitions[i], instr->operands[i].bytes()};
1737 }
1738 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1739 break;
1740 }
1741 case aco_opcode::p_exit_early_if:
1742 {
1743 /* don't bother with an early exit near the end of the program */
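/* i.e. if everything between here and s_endpgm is just p_logical_end, a null
 * export or a copy to exec, branching to the discard block would not skip any
 * real work, so the early exit can be dropped. */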
1744 if ((block->instructions.size() - 1 - j) <= 4 &&
1745 block->instructions.back()->opcode == aco_opcode::s_endpgm) {
1746 unsigned null_exp_dest = (ctx.program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
1747 bool ignore_early_exit = true;
1748
1749 for (unsigned k = j + 1; k < block->instructions.size(); ++k) {
1750 const aco_ptr<Instruction> &instr = block->instructions[k];
1751 if (instr->opcode == aco_opcode::s_endpgm ||
1752 instr->opcode == aco_opcode::p_logical_end)
1753 continue;
1754 else if (instr->opcode == aco_opcode::exp &&
1755 static_cast<Export_instruction *>(instr.get())->dest == null_exp_dest)
1756 continue;
1757 else if (instr->opcode == aco_opcode::p_parallelcopy &&
1758 instr->definitions[0].isFixed() &&
1759 instr->definitions[0].physReg() == exec)
1760 continue;
1761
1762 ignore_early_exit = false;
1763 }
1764
1765 if (ignore_early_exit)
1766 break;
1767 }
1768
1769 if (!discard_block) {
1770 discard_block = program->create_and_insert_block();
1771 block = &program->blocks[i];
1772
1773 bld.reset(discard_block);
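/* exit path: issue a final export to the NULL target marked "done" (shaders
 * that export are expected to complete their exports before s_endpgm), write
 * back the scalar cache if the program requires it, then end the program. */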
1774 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
1775 0, V_008DFC_SQ_EXP_NULL, false, true, true);
1776 if (program->wb_smem_l1_on_end)
1777 bld.smem(aco_opcode::s_dcache_wb);
1778 bld.sopp(aco_opcode::s_endpgm);
1779
1780 bld.reset(&ctx.instructions);
1781 }
1782
1783 //TODO: exec can be zero here with block_kind_discard
1784
1785 assert(instr->operands[0].physReg() == scc);
1786 bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index);
1787
1788 discard_block->linear_preds.push_back(block->index);
1789 block->linear_succs.push_back(discard_block->index);
1790 break;
1791 }
1792 case aco_opcode::p_spill:
1793 {
1794 assert(instr->operands[0].regClass() == v1.as_linear());
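/* p_spill: operands[0] is the linear spill VGPR, operands[1] the base lane and
 * operands[2] the SGPR(s) to spill; SGPR dword i is written into lane (base + i)
 * of the spill VGPR via v_writelane. */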
1795 for (unsigned i = 0; i < instr->operands[2].size(); i++)
1796 bld.writelane(bld.def(v1, instr->operands[0].physReg()),
1797 Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
1798 Operand(instr->operands[1].constantValue() + i),
1799 instr->operands[0]);
1800 break;
1801 }
1802 case aco_opcode::p_reload:
1803 {
1804 assert(instr->operands[0].regClass() == v1.as_linear());
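/* p_reload: read dword i of the destination back from lane (base + i) of the
 * spill VGPR via v_readlane. */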
1805 for (unsigned i = 0; i < instr->definitions[0].size(); i++)
1806 bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
1807 instr->operands[0],
1808 Operand(instr->operands[1].constantValue() + i));
1809 break;
1810 }
1811 case aco_opcode::p_as_uniform:
1812 {
1813 if (instr->operands[0].isConstant() || instr->operands[0].regClass().type() == RegType::sgpr) {
1814 std::map<PhysReg, copy_operation> copy_operations;
1815 copy_operations[instr->definitions[0].physReg()] = {instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
1816 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1817 } else {
1818 assert(instr->operands[0].regClass().type() == RegType::vgpr);
1819 assert(instr->definitions[0].regClass().type() == RegType::sgpr);
1820 assert(instr->operands[0].size() == instr->definitions[0].size());
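/* VGPR -> SGPR: the value is uniform by definition of p_as_uniform, so reading
 * it from the first active lane with v_readfirstlane_b32, one dword at a time,
 * is sufficient. */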
1821 for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
1822 bld.vop1(aco_opcode::v_readfirstlane_b32,
1823 bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
1824 Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
1825 }
1826 }
1827 break;
1828 }
1829 case aco_opcode::p_bpermute:
1830 {
1831 if (ctx.program->chip_class <= GFX7)
1832 emit_gfx6_bpermute(program, instr, bld);
1833 else if (ctx.program->chip_class == GFX10 && ctx.program->wave_size == 64)
1834 emit_gfx10_wave64_bpermute(program, instr, bld);
1835 else
1836 unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");
break;
1837 }
1838 default:
1839 break;
1840 }
1841 } else if (instr->format == Format::PSEUDO_BRANCH) {
1842 Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(instr.get());
1843 /* check if all blocks from current to target are empty */
1844 bool can_remove = block->index < branch->target[0];
1845 for (unsigned i = block->index + 1; can_remove && i < branch->target[0]; i++) {
1846 if (program->blocks[i].instructions.size())
1847 can_remove = false;
1848 }
1849 if (can_remove)
1850 continue;
1851
1852 switch (instr->opcode) {
1853 case aco_opcode::p_branch:
1854 assert(block->linear_succs[0] == branch->target[0]);
1855 bld.sopp(aco_opcode::s_branch, branch->target[0]);
1856 break;
1857 case aco_opcode::p_cbranch_nz:
1858 assert(block->linear_succs[1] == branch->target[0]);
1859 if (branch->operands[0].physReg() == exec)
1860 bld.sopp(aco_opcode::s_cbranch_execnz, branch->target[0]);
1861 else if (branch->operands[0].physReg() == vcc)
1862 bld.sopp(aco_opcode::s_cbranch_vccnz, branch->target[0]);
1863 else {
1864 assert(branch->operands[0].physReg() == scc);
1865 bld.sopp(aco_opcode::s_cbranch_scc1, branch->target[0]);
1866 }
1867 break;
1868 case aco_opcode::p_cbranch_z:
1869 assert(block->linear_succs[1] == branch->target[0]);
1870 if (branch->operands[0].physReg() == exec)
1871 bld.sopp(aco_opcode::s_cbranch_execz, branch->target[0]);
1872 else if (branch->operands[0].physReg() == vcc)
1873 bld.sopp(aco_opcode::s_cbranch_vccz, branch->target[0]);
1874 else {
1875 assert(branch->operands[0].physReg() == scc);
1876 bld.sopp(aco_opcode::s_cbranch_scc0, branch->target[0]);
1877 }
1878 break;
1879 default:
1880 unreachable("Unknown Pseudo branch instruction!");
1881 }
1882
1883 } else if (instr->format == Format::PSEUDO_REDUCTION) {
1884 Pseudo_reduction_instruction* reduce = static_cast<Pseudo_reduction_instruction*>(instr.get());
1885 emit_reduction(&ctx, reduce->opcode, reduce->reduce_op, reduce->cluster_size,
1886 reduce->operands[1].physReg(), // tmp
1887 reduce->definitions[1].physReg(), // stmp
1888 reduce->operands[2].physReg(), // vtmp
1889 reduce->definitions[2].physReg(), // sitmp
1890 reduce->operands[0], reduce->definitions[0]);
1891 } else {
1892 ctx.instructions.emplace_back(std::move(instr));
1893 }
1894
1895 }
1896 block->instructions.swap(ctx.instructions);
1897 }
1898 }
1899
1900 }