[mesa.git] src/amd/compiler/aco_instruction_selection.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <map>
29
30 #include "ac_shader_util.h"
31 #include "aco_ir.h"
32 #include "aco_builder.h"
33 #include "aco_interface.h"
34 #include "aco_instruction_selection_setup.cpp"
35 #include "util/fast_idiv_by_const.h"
36
37 namespace aco {
38 namespace {
39
40 class loop_info_RAII {
41 isel_context* ctx;
42 unsigned header_idx_old;
43 Block* exit_old;
44 bool divergent_cont_old;
45 bool divergent_branch_old;
46 bool divergent_if_old;
47
48 public:
49 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
50 : ctx(ctx),
51 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
52 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
53 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
54 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
55 {
56 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
57 ctx->cf_info.parent_loop.exit = loop_exit;
58 ctx->cf_info.parent_loop.has_divergent_continue = false;
59 ctx->cf_info.parent_loop.has_divergent_branch = false;
60 ctx->cf_info.parent_if.is_divergent = false;
61 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
62 }
63
64 ~loop_info_RAII()
65 {
66 ctx->cf_info.parent_loop.header_idx = header_idx_old;
67 ctx->cf_info.parent_loop.exit = exit_old;
68 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
69 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
70 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
71 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
72 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
73 ctx->cf_info.exec_potentially_empty = false;
74 }
75 };
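/* Illustrative usage sketch (the actual call site is in the loop-visiting code
 * later in this file): the RAII object saves the parent-loop CF state on
 * construction and restores it when it goes out of scope, so nesting works by
 * simply scoping the guard around the loop body:
 *
 *   {
 *      loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
 *      visit_cf_list(ctx, &loop->body);
 *   } // previous parent_loop/parent_if state restored here
 */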
76
77 struct if_context {
78 Temp cond;
79
80 bool divergent_old;
81 bool exec_potentially_empty_old;
82
83 unsigned BB_if_idx;
84 unsigned invert_idx;
85 bool then_branch_divergent;
86 Block BB_invert;
87 Block BB_endif;
88 };
89
90 static void visit_cf_list(struct isel_context *ctx,
91 struct exec_list *list);
92
93 static void add_logical_edge(unsigned pred_idx, Block *succ)
94 {
95 succ->logical_preds.emplace_back(pred_idx);
96 }
97
98
99 static void add_linear_edge(unsigned pred_idx, Block *succ)
100 {
101 succ->linear_preds.emplace_back(pred_idx);
102 }
103
104 static void add_edge(unsigned pred_idx, Block *succ)
105 {
106 add_logical_edge(pred_idx, succ);
107 add_linear_edge(pred_idx, succ);
108 }
109
110 static void append_logical_start(Block *b)
111 {
112 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
113 }
114
115 static void append_logical_end(Block *b)
116 {
117 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
118 }
119
120 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
121 {
122 assert(ctx->allocated[def->index].id());
123 return ctx->allocated[def->index];
124 }
125
126 Temp emit_mbcnt(isel_context *ctx, Definition dst,
127 Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
128 {
129 Builder bld(ctx->program, ctx->block);
130 Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
131 Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));
132
133 if (ctx->program->wave_size == 32) {
134 return thread_id_lo;
135 } else {
136 Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
137 return thread_id_hi;
138 }
139 }
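/* Note: v_mbcnt_lo_u32_b32 counts the set bits of mask_lo belonging to lanes below
 * the current one, and v_mbcnt_hi_u32_b32 adds the corresponding bits of mask_hi,
 * so with the default all-ones masks the result is simply the invocation's lane
 * index (0..wave_size-1). Passing exec as the masks instead would count only the
 * active lanes below the current one. */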
140
141 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
142 {
143 Builder bld(ctx->program, ctx->block);
144
145 if (!dst.id())
146 dst = bld.tmp(src.regClass());
147
148 assert(src.size() == dst.size());
149
150 if (ctx->stage != fragment_fs) {
151 if (!dst.id())
152 return src;
153
154 bld.copy(Definition(dst), src);
155 return dst;
156 }
157
158 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
159 ctx->program->needs_wqm |= program_needs_wqm;
160 return dst;
161 }
162
163 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
164 {
165 if (index.regClass() == s1)
166 return bld.readlane(bld.def(s1), data, index);
167
168 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
169
170 /* Currently not implemented on GFX6-7 */
171 assert(ctx->options->chip_class >= GFX8);
172
173 if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) {
174 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
175 }
176
177 /* GFX10, wave64 mode:
178 * The bpermute instruction is limited to half-wave operation, which means that it can't
179 * properly support subgroup shuffle like older generations (or wave32 mode), so we
180 * emulate it here.
181 */
182 if (!ctx->has_gfx10_wave64_bpermute) {
183 ctx->has_gfx10_wave64_bpermute = true;
184 ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
185 ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
186 }
187
188 Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
189 Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
190 Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
191 Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2, vcc), lane_is_hi, index_is_hi);
192
193 return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
194 bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
195 }
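/* Rough shape of the wave64 emulation above: lane_is_hi/index_is_hi test bit 5
 * (value 32) of the own lane id and of the requested source lane, i.e. in which
 * half-wave each lies. ds_bpermute only shuffles within a half-wave, so the
 * p_wave64_bpermute pseudo-instruction is expanded by a later ACO pass into code
 * that uses the reserved shared VGPRs to move data across the two halves whenever
 * the comparison shows the source lane sits in the other half. */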
196
197 Temp as_vgpr(isel_context *ctx, Temp val)
198 {
199 if (val.type() == RegType::sgpr) {
200 Builder bld(ctx->program, ctx->block);
201 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
202 }
203 assert(val.type() == RegType::vgpr);
204 return val;
205 }
206
207 //assumes a != 0xffffffff
208 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
209 {
210 assert(b != 0);
211 Builder bld(ctx->program, ctx->block);
212
213 if (util_is_power_of_two_or_zero(b)) {
214 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
215 return;
216 }
217
218 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
219
220 assert(info.multiplier <= 0xffffffff);
221
222 bool pre_shift = info.pre_shift != 0;
223 bool increment = info.increment != 0;
224 bool multiply = true;
225 bool post_shift = info.post_shift != 0;
226
227 if (!pre_shift && !increment && !multiply && !post_shift) {
228 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
229 return;
230 }
231
232 Temp pre_shift_dst = a;
233 if (pre_shift) {
234 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
235 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
236 }
237
238 Temp increment_dst = pre_shift_dst;
239 if (increment) {
240 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
241 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
242 }
243
244 Temp multiply_dst = increment_dst;
245 if (multiply) {
246 multiply_dst = post_shift ? bld.tmp(v1) : dst;
247 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
248 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
249 }
250
251 if (post_shift) {
252 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
253 }
254 }
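/* Scalar sketch of what emit_v_div_u32 emits for a non-power-of-two divisor
 * (illustrative, following the util_fast_udiv_info fields used above):
 *
 *   uint32_t fast_udiv(uint32_t a, struct util_fast_udiv_info info)
 *   {
 *      uint32_t q = a >> info.pre_shift;                        // v_lshrrev_b32
 *      q += info.increment;                                     // vadd32
 *      q = (uint32_t)(((uint64_t)q * info.multiplier) >> 32);   // v_mul_hi_u32
 *      return q >> info.post_shift;                             // == a / b
 *   }
 *
 * Power-of-two divisors skip all of this and become a single right shift. */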
255
256 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
257 {
258 Builder bld(ctx->program, ctx->block);
259 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
260 }
261
262
263 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
264 {
265 /* no need to extract the whole vector */
266 if (src.regClass() == dst_rc) {
267 assert(idx == 0);
268 return src;
269 }
270 assert(src.size() > idx);
271 Builder bld(ctx->program, ctx->block);
272 auto it = ctx->allocated_vec.find(src.id());
273 /* the size check needs to be early because elements other than 0 may be garbage */
274 if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
275 if (it->second[idx].regClass() == dst_rc) {
276 return it->second[idx];
277 } else {
278 assert(dst_rc.size() == it->second[idx].regClass().size());
279 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
280 return bld.copy(bld.def(dst_rc), it->second[idx]);
281 }
282 }
283
284 if (src.size() == dst_rc.size()) {
285 assert(idx == 0);
286 return bld.copy(bld.def(dst_rc), src);
287 } else {
288 Temp dst = bld.tmp(dst_rc);
289 emit_extract_vector(ctx, src, idx, dst);
290 return dst;
291 }
292 }
293
294 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
295 {
296 if (num_components == 1)
297 return;
298 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
299 return;
300 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
301 split->operands[0] = Operand(vec_src);
302 std::array<Temp,4> elems;
303 for (unsigned i = 0; i < num_components; i++) {
304 elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
305 split->definitions[i] = Definition(elems[i]);
306 }
307 ctx->block->instructions.emplace_back(std::move(split));
308 ctx->allocated_vec.emplace(vec_src.id(), elems);
309 }
310
311 /* This vector expansion uses a mask to determine which elements in the new vector
312 * come from the original vector. The other elements are undefined. */
313 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
314 {
315 emit_split_vector(ctx, vec_src, util_bitcount(mask));
316
317 if (vec_src == dst)
318 return;
319
320 Builder bld(ctx->program, ctx->block);
321 if (num_components == 1) {
322 if (dst.type() == RegType::sgpr)
323 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
324 else
325 bld.copy(Definition(dst), vec_src);
326 return;
327 }
328
329 unsigned component_size = dst.size() / num_components;
330 std::array<Temp,4> elems;
331
332 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
333 vec->definitions[0] = Definition(dst);
334 unsigned k = 0;
335 for (unsigned i = 0; i < num_components; i++) {
336 if (mask & (1 << i)) {
337 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
338 if (dst.type() == RegType::sgpr)
339 src = bld.as_uniform(src);
340 vec->operands[i] = Operand(src);
341 } else {
342 vec->operands[i] = Operand(0u);
343 }
344 elems[i] = vec->operands[i].getTemp();
345 }
346 ctx->block->instructions.emplace_back(std::move(vec));
347 ctx->allocated_vec.emplace(dst.id(), elems);
348 }
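/* Example: expand_vector(ctx, vec_src, dst, 4, 0b0101) builds
 *   dst = { vec_src[0], 0, vec_src[1], 0 }
 * i.e. the set mask bits receive the source components in order, while the other
 * slots are written with a zero constant (callers must treat them as undefined,
 * per the comment above). */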
349
350 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
351 {
352 Builder bld(ctx->program, ctx->block);
353 if (!dst.id())
354 dst = bld.tmp(bld.lm);
355
356 assert(val.regClass() == s1);
357 assert(dst.regClass() == bld.lm);
358
359 return bld.sop2(Builder::s_cselect, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
360 }
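/* e.g. a scalar boolean of 1 in SCC becomes the all-ones lane mask (every lane
 * true) and 0 becomes the empty mask, which is how divergent booleans are
 * represented. */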
361
362 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
363 {
364 Builder bld(ctx->program, ctx->block);
365 if (!dst.id())
366 dst = bld.tmp(s1);
367
368 assert(val.regClass() == bld.lm);
369 assert(dst.regClass() == s1);
370
371 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
372 Temp tmp = bld.tmp(s1);
373 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
374 return emit_wqm(ctx, tmp, dst);
375 }
376
377 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
378 {
379 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
380 return get_ssa_temp(ctx, src.src.ssa);
381
382 if (src.src.ssa->num_components == size) {
383 bool identity_swizzle = true;
384 for (unsigned i = 0; identity_swizzle && i < size; i++) {
385 if (src.swizzle[i] != i)
386 identity_swizzle = false;
387 }
388 if (identity_swizzle)
389 return get_ssa_temp(ctx, src.src.ssa);
390 }
391
392 Temp vec = get_ssa_temp(ctx, src.src.ssa);
393 unsigned elem_size = vec.size() / src.src.ssa->num_components;
394 assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
395 assert(vec.size() % elem_size == 0);
396
397 RegClass elem_rc = RegClass(vec.type(), elem_size);
398 if (size == 1) {
399 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
400 } else {
401 assert(size <= 4);
402 std::array<Temp,4> elems;
403 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
404 for (unsigned i = 0; i < size; ++i) {
405 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
406 vec_instr->operands[i] = Operand{elems[i]};
407 }
408 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
409 vec_instr->definitions[0] = Definition(dst);
410 ctx->block->instructions.emplace_back(std::move(vec_instr));
411 ctx->allocated_vec.emplace(dst.id(), elems);
412 return dst;
413 }
414 }
415
416 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
417 {
418 if (ptr.size() == 2)
419 return ptr;
420 Builder bld(ctx->program, ctx->block);
421 if (ptr.type() == RegType::vgpr)
422 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
423 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
424 ptr, Operand((unsigned)ctx->options->address32_hi));
425 }
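/* A 32-bit pointer is widened by pairing it with address32_hi, the constant upper
 * 32 bits of the 32-bit-address aperture; a VGPR pointer is first made uniform via
 * v_readfirstlane_b32 because the result is assembled as an SGPR pair. */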
426
427 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
428 {
429 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
430 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
431 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
432 sop2->definitions[0] = Definition(dst);
433 if (writes_scc)
434 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
435 ctx->block->instructions.emplace_back(std::move(sop2));
436 }
437
438 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
439 bool commutative, bool swap_srcs=false, bool flush_denorms = false)
440 {
441 Builder bld(ctx->program, ctx->block);
442 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
443 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
444 if (src1.type() == RegType::sgpr) {
445 if (commutative && src0.type() == RegType::vgpr) {
446 Temp t = src0;
447 src0 = src1;
448 src1 = t;
449 } else if (src0.type() == RegType::vgpr &&
450 op != aco_opcode::v_madmk_f32 &&
451 op != aco_opcode::v_madak_f32 &&
452 op != aco_opcode::v_madmk_f16 &&
453 op != aco_opcode::v_madak_f16) {
454 /* If the instruction is not commutative, we emit a VOP3A instruction */
455 bld.vop2_e64(op, Definition(dst), src0, src1);
456 return;
457 } else {
458 src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
459 }
460 }
461
462 if (flush_denorms && ctx->program->chip_class < GFX9) {
463 assert(dst.size() == 1);
464 Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
465 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
466 } else {
467 bld.vop2(op, Definition(dst), src0, src1);
468 }
469 }
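/* Why the operand shuffling above: the VOP2 encoding allows an SGPR or constant
 * only in the first source (src1 must be a VGPR), so an SGPR in src1 is handled
 * by swapping (commutative ops), by using the VOP3-encoded form (vop2_e64), or by
 * copying src1 into a VGPR. v_madmk/v_madak carry an inline literal and have no
 * VOP3 form, hence the copy fallback for them. The flush_denorms path multiplies
 * the result by 1.0 (0x3f800000) so that a denormal result gets flushed on
 * pre-GFX9 parts. */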
470
471 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
472 bool flush_denorms = false)
473 {
474 Temp src0 = get_alu_src(ctx, instr->src[0]);
475 Temp src1 = get_alu_src(ctx, instr->src[1]);
476 Temp src2 = get_alu_src(ctx, instr->src[2]);
477
478    /* ensure that the instruction has at most one SGPR operand;
479     * the optimizer will inline constants for us */
480 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
481 src0 = as_vgpr(ctx, src0);
482 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
483 src1 = as_vgpr(ctx, src1);
484 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
485 src2 = as_vgpr(ctx, src2);
486
487 Builder bld(ctx->program, ctx->block);
488 if (flush_denorms && ctx->program->chip_class < GFX9) {
489 assert(dst.size() == 1);
490 Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
491 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
492 } else {
493 bld.vop3(op, Definition(dst), src0, src1, src2);
494 }
495 }
496
497 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
498 {
499 Builder bld(ctx->program, ctx->block);
500 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
501 }
502
503 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
504 {
505 Temp src0 = get_alu_src(ctx, instr->src[0]);
506 Temp src1 = get_alu_src(ctx, instr->src[1]);
507 assert(src0.size() == src1.size());
508
509 aco_ptr<Instruction> vopc;
510 if (src1.type() == RegType::sgpr) {
511 if (src0.type() == RegType::vgpr) {
512 /* to swap the operands, we might also have to change the opcode */
513 switch (op) {
514 case aco_opcode::v_cmp_lt_f32:
515 op = aco_opcode::v_cmp_gt_f32;
516 break;
517 case aco_opcode::v_cmp_ge_f32:
518 op = aco_opcode::v_cmp_le_f32;
519 break;
520 case aco_opcode::v_cmp_lt_i32:
521 op = aco_opcode::v_cmp_gt_i32;
522 break;
523 case aco_opcode::v_cmp_ge_i32:
524 op = aco_opcode::v_cmp_le_i32;
525 break;
526 case aco_opcode::v_cmp_lt_u32:
527 op = aco_opcode::v_cmp_gt_u32;
528 break;
529 case aco_opcode::v_cmp_ge_u32:
530 op = aco_opcode::v_cmp_le_u32;
531 break;
532 case aco_opcode::v_cmp_lt_f64:
533 op = aco_opcode::v_cmp_gt_f64;
534 break;
535 case aco_opcode::v_cmp_ge_f64:
536 op = aco_opcode::v_cmp_le_f64;
537 break;
538 case aco_opcode::v_cmp_lt_i64:
539 op = aco_opcode::v_cmp_gt_i64;
540 break;
541 case aco_opcode::v_cmp_ge_i64:
542 op = aco_opcode::v_cmp_le_i64;
543 break;
544 case aco_opcode::v_cmp_lt_u64:
545 op = aco_opcode::v_cmp_gt_u64;
546 break;
547 case aco_opcode::v_cmp_ge_u64:
548 op = aco_opcode::v_cmp_le_u64;
549 break;
550 default: /* eq and ne are commutative */
551 break;
552 }
553 Temp t = src0;
554 src0 = src1;
555 src1 = t;
556 } else {
557 src1 = as_vgpr(ctx, src1);
558 }
559 }
560
561 Builder bld(ctx->program, ctx->block);
562 bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
563 }
564
565 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
566 {
567 Temp src0 = get_alu_src(ctx, instr->src[0]);
568 Temp src1 = get_alu_src(ctx, instr->src[1]);
569 Builder bld(ctx->program, ctx->block);
570
571 assert(dst.regClass() == bld.lm);
572 assert(src0.type() == RegType::sgpr);
573 assert(src1.type() == RegType::sgpr);
574 assert(src0.regClass() == src1.regClass());
575
576 /* Emit the SALU comparison instruction */
577 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
578 /* Turn the result into a per-lane bool */
579 bool_to_vector_condition(ctx, cmp, dst);
580 }
581
582 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
583 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
584 {
585 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op;
586 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op;
587 bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
588 bool use_valu = s_op == aco_opcode::num_opcodes ||
589 divergent_vals ||
590 ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
591 ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
592 aco_opcode op = use_valu ? v_op : s_op;
593 assert(op != aco_opcode::num_opcodes);
594
595 if (use_valu)
596 emit_vopc_instruction(ctx, instr, op, dst);
597 else
598 emit_sopc_instruction(ctx, instr, op, dst);
599 }
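/* The scalar (SOPC) form is only used when a scalar opcode exists for this
 * comparison, the result is uniform and both sources already live in SGPRs; in
 * every other case the VOPC form is chosen, which produces the per-lane mask
 * directly. */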
600
601 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
602 {
603 Builder bld(ctx->program, ctx->block);
604 Temp src0 = get_alu_src(ctx, instr->src[0]);
605 Temp src1 = get_alu_src(ctx, instr->src[1]);
606
607 assert(dst.regClass() == bld.lm);
608 assert(src0.regClass() == bld.lm);
609 assert(src1.regClass() == bld.lm);
610
611 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
612 }
613
614 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
615 {
616 Builder bld(ctx->program, ctx->block);
617 Temp cond = get_alu_src(ctx, instr->src[0]);
618 Temp then = get_alu_src(ctx, instr->src[1]);
619 Temp els = get_alu_src(ctx, instr->src[2]);
620
621 assert(cond.regClass() == bld.lm);
622
623 if (dst.type() == RegType::vgpr) {
624 aco_ptr<Instruction> bcsel;
625 if (dst.size() == 1) {
626 then = as_vgpr(ctx, then);
627 els = as_vgpr(ctx, els);
628
629 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
630 } else if (dst.size() == 2) {
631 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
632 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
633 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
634 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
635
636 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
637 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
638
639 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
640 } else {
641 fprintf(stderr, "Unimplemented NIR instr bit size: ");
642 nir_print_instr(&instr->instr, stderr);
643 fprintf(stderr, "\n");
644 }
645 return;
646 }
647
648 if (instr->dest.dest.ssa.bit_size == 1) {
649 assert(dst.regClass() == bld.lm);
650 assert(then.regClass() == bld.lm);
651 assert(els.regClass() == bld.lm);
652 }
653
654 if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
655 if (dst.regClass() == s1 || dst.regClass() == s2) {
656 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
657 assert(dst.size() == then.size());
658 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
659 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
660 } else {
661 fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
662 nir_print_instr(&instr->instr, stderr);
663 fprintf(stderr, "\n");
664 }
665 return;
666 }
667
668    /* divergent boolean bcsel
669     * this implements bcsel on booleans: dst = s0 ? s1 : s2
670     * which expands to: dst = (s0 & s1) | (~s0 & s2) */
671 assert(instr->dest.dest.ssa.bit_size == 1);
672
673 if (cond.id() != then.id())
674 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
675
676 if (cond.id() == els.id())
677 bld.sop1(Builder::s_mov, Definition(dst), then);
678 else
679 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
680 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
681 }
682
683 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
684 aco_opcode op, uint32_t undo)
685 {
686 /* multiply by 16777216 to handle denormals */
687 Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
688 as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
689 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
690 scaled = bld.vop1(op, bld.def(v1), scaled);
691 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
692
693 Temp not_scaled = bld.vop1(op, bld.def(v1), val);
694
695 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
696 }
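/* The magic numbers: 0x4b800000 is 16777216.0 (2^24) and the v_cmp_class mask
 * (1u << 7) | (1u << 4) selects +denormal and -denormal inputs. For denormals the
 * operation runs on val * 2^24 and `undo` scales the result back: rcp(val * 2^24)
 * == rcp(val) * 2^-24, so the rcp caller below passes 2^24 (0x4b800000) again;
 * rsq passes 2^12 (0x45800000) and sqrt 2^-12 (0x39800000). */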
697
698 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
699 {
700 if (ctx->block->fp_mode.denorm32 == 0) {
701 bld.vop1(aco_opcode::v_rcp_f32, dst, val);
702 return;
703 }
704
705 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
706 }
707
708 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
709 {
710 if (ctx->block->fp_mode.denorm32 == 0) {
711 bld.vop1(aco_opcode::v_rsq_f32, dst, val);
712 return;
713 }
714
715 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
716 }
717
718 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
719 {
720 if (ctx->block->fp_mode.denorm32 == 0) {
721 bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
722 return;
723 }
724
725 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
726 }
727
728 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
729 {
730 if (ctx->block->fp_mode.denorm32 == 0) {
731 bld.vop1(aco_opcode::v_log_f32, dst, val);
732 return;
733 }
734
735 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
736 }
737
738 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
739 {
740 if (!instr->dest.dest.is_ssa) {
741 fprintf(stderr, "nir alu dst not in ssa: ");
742 nir_print_instr(&instr->instr, stderr);
743 fprintf(stderr, "\n");
744 abort();
745 }
746 Builder bld(ctx->program, ctx->block);
747 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
748 switch(instr->op) {
749 case nir_op_vec2:
750 case nir_op_vec3:
751 case nir_op_vec4: {
752 std::array<Temp,4> elems;
753 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
754 for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
755 elems[i] = get_alu_src(ctx, instr->src[i]);
756 vec->operands[i] = Operand{elems[i]};
757 }
758 vec->definitions[0] = Definition(dst);
759 ctx->block->instructions.emplace_back(std::move(vec));
760 ctx->allocated_vec.emplace(dst.id(), elems);
761 break;
762 }
763 case nir_op_mov: {
764 Temp src = get_alu_src(ctx, instr->src[0]);
765 aco_ptr<Instruction> mov;
766 if (dst.type() == RegType::sgpr) {
767 if (src.type() == RegType::vgpr)
768 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
769 else if (src.regClass() == s1)
770 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
771 else if (src.regClass() == s2)
772 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
773 else
774    unreachable("wrong src register class for nir_op_mov");
775 } else if (dst.regClass() == v1) {
776 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
777 } else if (dst.regClass() == v2) {
778 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
779 } else {
780 nir_print_instr(&instr->instr, stderr);
781 unreachable("Should have been lowered to scalar.");
782 }
783 break;
784 }
785 case nir_op_inot: {
786 Temp src = get_alu_src(ctx, instr->src[0]);
787 if (instr->dest.dest.ssa.bit_size == 1) {
788 assert(src.regClass() == bld.lm);
789 assert(dst.regClass() == bld.lm);
790 bld.sop2(Builder::s_andn2, Definition(dst), bld.def(s1, scc), Operand(exec, bld.lm), src);
791 } else if (dst.regClass() == v1) {
792 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
793 } else if (dst.type() == RegType::sgpr) {
794 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
795 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
796 } else {
797 fprintf(stderr, "Unimplemented NIR instr bit size: ");
798 nir_print_instr(&instr->instr, stderr);
799 fprintf(stderr, "\n");
800 }
801 break;
802 }
803 case nir_op_ineg: {
804 Temp src = get_alu_src(ctx, instr->src[0]);
805 if (dst.regClass() == v1) {
806 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
807 } else if (dst.regClass() == s1) {
808 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
809 } else if (dst.size() == 2) {
810 Temp src0 = bld.tmp(dst.type(), 1);
811 Temp src1 = bld.tmp(dst.type(), 1);
812 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
813
814 if (dst.regClass() == s2) {
815 Temp carry = bld.tmp(s1);
816 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
817 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
818 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
819 } else {
820 Temp lower = bld.tmp(v1);
821 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
822 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
823 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
824 }
825 } else {
826 fprintf(stderr, "Unimplemented NIR instr bit size: ");
827 nir_print_instr(&instr->instr, stderr);
828 fprintf(stderr, "\n");
829 }
830 break;
831 }
832 case nir_op_iabs: {
833 if (dst.regClass() == s1) {
834 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
835 } else if (dst.regClass() == v1) {
836 Temp src = get_alu_src(ctx, instr->src[0]);
837 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
838 } else {
839 fprintf(stderr, "Unimplemented NIR instr bit size: ");
840 nir_print_instr(&instr->instr, stderr);
841 fprintf(stderr, "\n");
842 }
843 break;
844 }
845 case nir_op_isign: {
846 Temp src = get_alu_src(ctx, instr->src[0]);
847 if (dst.regClass() == s1) {
848 Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
849 Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
850 bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
851 } else if (dst.regClass() == s2) {
852 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
853 Temp neqz;
854 if (ctx->program->chip_class >= GFX8)
855 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
856 else
857 neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
858 /* SCC gets zero-extended to 64 bit */
859 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
860 } else if (dst.regClass() == v1) {
861 Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
862 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
863 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
864 } else if (dst.regClass() == v2) {
865 Temp upper = emit_extract_vector(ctx, src, 1, v1);
866 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
867 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
868 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
869 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
870 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
871 } else {
872 fprintf(stderr, "Unimplemented NIR instr bit size: ");
873 nir_print_instr(&instr->instr, stderr);
874 fprintf(stderr, "\n");
875 }
876 break;
877 }
878 case nir_op_imax: {
879 if (dst.regClass() == v1) {
880 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
881 } else if (dst.regClass() == s1) {
882 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
883 } else {
884 fprintf(stderr, "Unimplemented NIR instr bit size: ");
885 nir_print_instr(&instr->instr, stderr);
886 fprintf(stderr, "\n");
887 }
888 break;
889 }
890 case nir_op_umax: {
891 if (dst.regClass() == v1) {
892 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
893 } else if (dst.regClass() == s1) {
894 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
895 } else {
896 fprintf(stderr, "Unimplemented NIR instr bit size: ");
897 nir_print_instr(&instr->instr, stderr);
898 fprintf(stderr, "\n");
899 }
900 break;
901 }
902 case nir_op_imin: {
903 if (dst.regClass() == v1) {
904 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
905 } else if (dst.regClass() == s1) {
906 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
907 } else {
908 fprintf(stderr, "Unimplemented NIR instr bit size: ");
909 nir_print_instr(&instr->instr, stderr);
910 fprintf(stderr, "\n");
911 }
912 break;
913 }
914 case nir_op_umin: {
915 if (dst.regClass() == v1) {
916 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
917 } else if (dst.regClass() == s1) {
918 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
919 } else {
920 fprintf(stderr, "Unimplemented NIR instr bit size: ");
921 nir_print_instr(&instr->instr, stderr);
922 fprintf(stderr, "\n");
923 }
924 break;
925 }
926 case nir_op_ior: {
927 if (instr->dest.dest.ssa.bit_size == 1) {
928 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
929 } else if (dst.regClass() == v1) {
930 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
931 } else if (dst.regClass() == s1) {
932 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
933 } else if (dst.regClass() == s2) {
934 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
935 } else {
936 fprintf(stderr, "Unimplemented NIR instr bit size: ");
937 nir_print_instr(&instr->instr, stderr);
938 fprintf(stderr, "\n");
939 }
940 break;
941 }
942 case nir_op_iand: {
943 if (instr->dest.dest.ssa.bit_size == 1) {
944 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
945 } else if (dst.regClass() == v1) {
946 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
947 } else if (dst.regClass() == s1) {
948 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
949 } else if (dst.regClass() == s2) {
950 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
951 } else {
952 fprintf(stderr, "Unimplemented NIR instr bit size: ");
953 nir_print_instr(&instr->instr, stderr);
954 fprintf(stderr, "\n");
955 }
956 break;
957 }
958 case nir_op_ixor: {
959 if (instr->dest.dest.ssa.bit_size == 1) {
960 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
961 } else if (dst.regClass() == v1) {
962 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
963 } else if (dst.regClass() == s1) {
964 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
965 } else if (dst.regClass() == s2) {
966 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
967 } else {
968 fprintf(stderr, "Unimplemented NIR instr bit size: ");
969 nir_print_instr(&instr->instr, stderr);
970 fprintf(stderr, "\n");
971 }
972 break;
973 }
974 case nir_op_ushr: {
975 if (dst.regClass() == v1) {
976 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
977 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
978 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
979 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
980 } else if (dst.regClass() == v2) {
981 bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
982 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
983 } else if (dst.regClass() == s2) {
984 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
985 } else if (dst.regClass() == s1) {
986 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
987 } else {
988 fprintf(stderr, "Unimplemented NIR instr bit size: ");
989 nir_print_instr(&instr->instr, stderr);
990 fprintf(stderr, "\n");
991 }
992 break;
993 }
994 case nir_op_ishl: {
995 if (dst.regClass() == v1) {
996 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
997 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
998 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
999 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1000 } else if (dst.regClass() == v2) {
1001 bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
1002 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1003 } else if (dst.regClass() == s1) {
1004 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
1005 } else if (dst.regClass() == s2) {
1006 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1007 } else {
1008 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1009 nir_print_instr(&instr->instr, stderr);
1010 fprintf(stderr, "\n");
1011 }
1012 break;
1013 }
1014 case nir_op_ishr: {
1015 if (dst.regClass() == v1) {
1016 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1017 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1018 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
1019 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1020 } else if (dst.regClass() == v2) {
1021 bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
1022 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1023 } else if (dst.regClass() == s1) {
1024 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1025 } else if (dst.regClass() == s2) {
1026 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1027 } else {
1028 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1029 nir_print_instr(&instr->instr, stderr);
1030 fprintf(stderr, "\n");
1031 }
1032 break;
1033 }
1034 case nir_op_find_lsb: {
1035 Temp src = get_alu_src(ctx, instr->src[0]);
1036 if (src.regClass() == s1) {
1037 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1038 } else if (src.regClass() == v1) {
1039 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1040 } else if (src.regClass() == s2) {
1041 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1042 } else {
1043 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1044 nir_print_instr(&instr->instr, stderr);
1045 fprintf(stderr, "\n");
1046 }
1047 break;
1048 }
1049 case nir_op_ufind_msb:
1050 case nir_op_ifind_msb: {
1051 Temp src = get_alu_src(ctx, instr->src[0]);
1052 if (src.regClass() == s1 || src.regClass() == s2) {
1053 aco_opcode op = src.regClass() == s2 ?
1054 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1055 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1056 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1057
1058 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1059 Operand(src.size() * 32u - 1u), msb_rev);
1060 Temp msb = sub.def(0).getTemp();
1061 Temp carry = sub.def(1).getTemp();
1062
1063 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry);
1064 } else if (src.regClass() == v1) {
1065 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1066 Temp msb_rev = bld.tmp(v1);
1067 emit_vop1_instruction(ctx, instr, op, msb_rev);
1068 Temp msb = bld.tmp(v1);
1069 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1070 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1071 } else {
1072 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1073 nir_print_instr(&instr->instr, stderr);
1074 fprintf(stderr, "\n");
1075 }
1076 break;
1077 }
1078 case nir_op_bitfield_reverse: {
1079 if (dst.regClass() == s1) {
1080 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1081 } else if (dst.regClass() == v1) {
1082 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1083 } else {
1084 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1085 nir_print_instr(&instr->instr, stderr);
1086 fprintf(stderr, "\n");
1087 }
1088 break;
1089 }
1090 case nir_op_iadd: {
1091 if (dst.regClass() == s1) {
1092 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1093 break;
1094 }
1095
1096 Temp src0 = get_alu_src(ctx, instr->src[0]);
1097 Temp src1 = get_alu_src(ctx, instr->src[1]);
1098 if (dst.regClass() == v1) {
1099 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1100 break;
1101 }
1102
1103 assert(src0.size() == 2 && src1.size() == 2);
1104 Temp src00 = bld.tmp(src0.type(), 1);
1105 Temp src01 = bld.tmp(dst.type(), 1);
1106 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1107 Temp src10 = bld.tmp(src1.type(), 1);
1108 Temp src11 = bld.tmp(dst.type(), 1);
1109 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1110
1111 if (dst.regClass() == s2) {
1112 Temp carry = bld.tmp(s1);
1113 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1114 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1115 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1116 } else if (dst.regClass() == v2) {
1117 Temp dst0 = bld.tmp(v1);
1118 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1119 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1120 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1121 } else {
1122 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1123 nir_print_instr(&instr->instr, stderr);
1124 fprintf(stderr, "\n");
1125 }
1126 break;
1127 }
1128 case nir_op_uadd_sat: {
1129 Temp src0 = get_alu_src(ctx, instr->src[0]);
1130 Temp src1 = get_alu_src(ctx, instr->src[1]);
1131 if (dst.regClass() == s1) {
1132 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1133 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1134 src0, src1);
1135 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1136 } else if (dst.regClass() == v1) {
1137 if (ctx->options->chip_class >= GFX9) {
1138 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1139 add->operands[0] = Operand(src0);
1140 add->operands[1] = Operand(src1);
1141 add->definitions[0] = Definition(dst);
1142 add->clamp = 1;
1143 ctx->block->instructions.emplace_back(std::move(add));
1144 } else {
1145 if (src1.regClass() != v1)
1146 std::swap(src0, src1);
1147 assert(src1.regClass() == v1);
1148 Temp tmp = bld.tmp(v1);
1149 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1150 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1151 }
1152 } else {
1153 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1154 nir_print_instr(&instr->instr, stderr);
1155 fprintf(stderr, "\n");
1156 }
1157 break;
1158 }
1159 case nir_op_uadd_carry: {
1160 Temp src0 = get_alu_src(ctx, instr->src[0]);
1161 Temp src1 = get_alu_src(ctx, instr->src[1]);
1162 if (dst.regClass() == s1) {
1163 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1164 break;
1165 }
1166 if (dst.regClass() == v1) {
1167 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1168 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1169 break;
1170 }
1171
1172 Temp src00 = bld.tmp(src0.type(), 1);
1173 Temp src01 = bld.tmp(dst.type(), 1);
1174 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1175 Temp src10 = bld.tmp(src1.type(), 1);
1176 Temp src11 = bld.tmp(dst.type(), 1);
1177 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1178 if (dst.regClass() == s2) {
1179 Temp carry = bld.tmp(s1);
1180 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1181 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1182 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1183 } else if (dst.regClass() == v2) {
1184 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1185 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1186 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1187 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1188 } else {
1189 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1190 nir_print_instr(&instr->instr, stderr);
1191 fprintf(stderr, "\n");
1192 }
1193 break;
1194 }
1195 case nir_op_isub: {
1196 if (dst.regClass() == s1) {
1197 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1198 break;
1199 }
1200
1201 Temp src0 = get_alu_src(ctx, instr->src[0]);
1202 Temp src1 = get_alu_src(ctx, instr->src[1]);
1203 if (dst.regClass() == v1) {
1204 bld.vsub32(Definition(dst), src0, src1);
1205 break;
1206 }
1207
1208 Temp src00 = bld.tmp(src0.type(), 1);
1209 Temp src01 = bld.tmp(dst.type(), 1);
1210 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1211 Temp src10 = bld.tmp(src1.type(), 1);
1212 Temp src11 = bld.tmp(dst.type(), 1);
1213 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1214 if (dst.regClass() == s2) {
1215 Temp carry = bld.tmp(s1);
1216 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1217 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1218 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1219 } else if (dst.regClass() == v2) {
1220 Temp lower = bld.tmp(v1);
1221 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1222 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1223 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1224 } else {
1225 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1226 nir_print_instr(&instr->instr, stderr);
1227 fprintf(stderr, "\n");
1228 }
1229 break;
1230 }
1231 case nir_op_usub_borrow: {
1232 Temp src0 = get_alu_src(ctx, instr->src[0]);
1233 Temp src1 = get_alu_src(ctx, instr->src[1]);
1234 if (dst.regClass() == s1) {
1235 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1236 break;
1237 } else if (dst.regClass() == v1) {
1238 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1239 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1240 break;
1241 }
1242
1243 Temp src00 = bld.tmp(src0.type(), 1);
1244 Temp src01 = bld.tmp(dst.type(), 1);
1245 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1246 Temp src10 = bld.tmp(src1.type(), 1);
1247 Temp src11 = bld.tmp(dst.type(), 1);
1248 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1249 if (dst.regClass() == s2) {
1250 Temp borrow = bld.tmp(s1);
1251 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1252 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1253 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1254 } else if (dst.regClass() == v2) {
1255 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1256 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1257 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1258 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1259 } else {
1260 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1261 nir_print_instr(&instr->instr, stderr);
1262 fprintf(stderr, "\n");
1263 }
1264 break;
1265 }
1266 case nir_op_imul: {
1267 if (dst.regClass() == v1) {
1268 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1269 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1270 } else if (dst.regClass() == s1) {
1271 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1272 } else {
1273 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1274 nir_print_instr(&instr->instr, stderr);
1275 fprintf(stderr, "\n");
1276 }
1277 break;
1278 }
1279 case nir_op_umul_high: {
1280 if (dst.regClass() == v1) {
1281 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1282 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1283 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1284 } else if (dst.regClass() == s1) {
1285 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1286 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1287 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1288 } else {
1289 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1290 nir_print_instr(&instr->instr, stderr);
1291 fprintf(stderr, "\n");
1292 }
1293 break;
1294 }
1295 case nir_op_imul_high: {
1296 if (dst.regClass() == v1) {
1297 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1298 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1299 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1300 } else if (dst.regClass() == s1) {
1301 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1302 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1303 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1304 } else {
1305 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1306 nir_print_instr(&instr->instr, stderr);
1307 fprintf(stderr, "\n");
1308 }
1309 break;
1310 }
1311 case nir_op_fmul: {
1312 if (dst.size() == 1) {
1313 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1314 } else if (dst.size() == 2) {
1315 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1316 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1317 } else {
1318 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1319 nir_print_instr(&instr->instr, stderr);
1320 fprintf(stderr, "\n");
1321 }
1322 break;
1323 }
1324 case nir_op_fadd: {
1325 if (dst.size() == 1) {
1326 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1327 } else if (dst.size() == 2) {
1328 bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1329 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1330 } else {
1331 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1332 nir_print_instr(&instr->instr, stderr);
1333 fprintf(stderr, "\n");
1334 }
1335 break;
1336 }
1337 case nir_op_fsub: {
1338 Temp src0 = get_alu_src(ctx, instr->src[0]);
1339 Temp src1 = get_alu_src(ctx, instr->src[1]);
1340 if (dst.size() == 1) {
1341 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1342 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1343 else
1344 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1345 } else if (dst.size() == 2) {
1346 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1347 get_alu_src(ctx, instr->src[0]),
1348 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1349 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1350 sub->neg[1] = true;
1351 } else {
1352 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1353 nir_print_instr(&instr->instr, stderr);
1354 fprintf(stderr, "\n");
1355 }
1356 break;
1357 }
1358 case nir_op_fmax: {
1359 if (dst.size() == 1) {
1360 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1361 } else if (dst.size() == 2) {
1362 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1363 Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2),
1364 get_alu_src(ctx, instr->src[0]),
1365 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1366 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1367 } else {
1368 bld.vop3(aco_opcode::v_max_f64, Definition(dst),
1369 get_alu_src(ctx, instr->src[0]),
1370 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1371 }
1372 } else {
1373 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1374 nir_print_instr(&instr->instr, stderr);
1375 fprintf(stderr, "\n");
1376 }
1377 break;
1378 }
1379 case nir_op_fmin: {
1380 if (dst.size() == 1) {
1381 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1382 } else if (dst.size() == 2) {
1383 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1384 Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2),
1385 get_alu_src(ctx, instr->src[0]),
1386 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1387 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1388 } else {
1389 bld.vop3(aco_opcode::v_min_f64, Definition(dst),
1390 get_alu_src(ctx, instr->src[0]),
1391 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1392 }
1393 } else {
1394 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1395 nir_print_instr(&instr->instr, stderr);
1396 fprintf(stderr, "\n");
1397 }
1398 break;
1399 }
1400 case nir_op_fmax3: {
1401 if (dst.size() == 1) {
1402 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1403 } else {
1404 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1405 nir_print_instr(&instr->instr, stderr);
1406 fprintf(stderr, "\n");
1407 }
1408 break;
1409 }
1410 case nir_op_fmin3: {
1411 if (dst.size() == 1) {
1412 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1413 } else {
1414 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1415 nir_print_instr(&instr->instr, stderr);
1416 fprintf(stderr, "\n");
1417 }
1418 break;
1419 }
1420 case nir_op_fmed3: {
1421 if (dst.size() == 1) {
1422 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1423 } else {
1424 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1425 nir_print_instr(&instr->instr, stderr);
1426 fprintf(stderr, "\n");
1427 }
1428 break;
1429 }
1430 case nir_op_umax3: {
1431 if (dst.size() == 1) {
1432 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1433 } else {
1434 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1435 nir_print_instr(&instr->instr, stderr);
1436 fprintf(stderr, "\n");
1437 }
1438 break;
1439 }
1440 case nir_op_umin3: {
1441 if (dst.size() == 1) {
1442 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1443 } else {
1444 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1445 nir_print_instr(&instr->instr, stderr);
1446 fprintf(stderr, "\n");
1447 }
1448 break;
1449 }
1450 case nir_op_umed3: {
1451 if (dst.size() == 1) {
1452 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1453 } else {
1454 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1455 nir_print_instr(&instr->instr, stderr);
1456 fprintf(stderr, "\n");
1457 }
1458 break;
1459 }
1460 case nir_op_imax3: {
1461 if (dst.size() == 1) {
1462 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1463 } else {
1464 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1465 nir_print_instr(&instr->instr, stderr);
1466 fprintf(stderr, "\n");
1467 }
1468 break;
1469 }
1470 case nir_op_imin3: {
1471 if (dst.size() == 1) {
1472 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1473 } else {
1474 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1475 nir_print_instr(&instr->instr, stderr);
1476 fprintf(stderr, "\n");
1477 }
1478 break;
1479 }
1480 case nir_op_imed3: {
1481 if (dst.size() == 1) {
1482 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1483 } else {
1484 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1485 nir_print_instr(&instr->instr, stderr);
1486 fprintf(stderr, "\n");
1487 }
1488 break;
1489 }
1490 case nir_op_cube_face_coord: {
1491 Temp in = get_alu_src(ctx, instr->src[0], 3);
1492 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1493 emit_extract_vector(ctx, in, 1, v1),
1494 emit_extract_vector(ctx, in, 2, v1) };
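 /* v_cubema returns 2 * major_axis, so taking its reciprocal gives 1/(2*|ma|);
  * sc/tc are then scaled by it and biased by 0.5 to yield face coords in [0, 1]. */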
1495 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1496 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1497 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1498 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1499 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1500 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1501 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1502 break;
1503 }
1504 case nir_op_cube_face_index: {
1505 Temp in = get_alu_src(ctx, instr->src[0], 3);
1506 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1507 emit_extract_vector(ctx, in, 1, v1),
1508 emit_extract_vector(ctx, in, 2, v1) };
1509 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1510 break;
1511 }
1512 case nir_op_bcsel: {
1513 emit_bcsel(ctx, instr, dst);
1514 break;
1515 }
1516 case nir_op_frsq: {
1517 if (dst.size() == 1) {
1518 emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1519 } else if (dst.size() == 2) {
1520 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1521 } else {
1522 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1523 nir_print_instr(&instr->instr, stderr);
1524 fprintf(stderr, "\n");
1525 }
1526 break;
1527 }
1528 case nir_op_fneg: {
1529 Temp src = get_alu_src(ctx, instr->src[0]);
1530 if (dst.size() == 1) {
1531 if (ctx->block->fp_mode.must_flush_denorms32)
1532 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1533 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1534 } else if (dst.size() == 2) {
1535 if (ctx->block->fp_mode.must_flush_denorms16_64)
1536 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1537 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1538 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1539 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1540 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1541 } else {
1542 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1543 nir_print_instr(&instr->instr, stderr);
1544 fprintf(stderr, "\n");
1545 }
1546 break;
1547 }
1548 case nir_op_fabs: {
1549 Temp src = get_alu_src(ctx, instr->src[0]);
1550 if (dst.size() == 1) {
1551 if (ctx->block->fp_mode.must_flush_denorms32)
1552 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1553 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1554 } else if (dst.size() == 2) {
1555 if (ctx->block->fp_mode.must_flush_denorms16_64)
1556 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1557 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1558 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1559 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1560 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1561 } else {
1562 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1563 nir_print_instr(&instr->instr, stderr);
1564 fprintf(stderr, "\n");
1565 }
1566 break;
1567 }
1568 case nir_op_fsat: {
1569 Temp src = get_alu_src(ctx, instr->src[0]);
1570 if (dst.size() == 1) {
1571 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1572 /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
1573 // TODO: confirm that this holds under any circumstances
1574 } else if (dst.size() == 2) {
1575 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1576 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1577 vop3->clamp = true;
1578 } else {
1579 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1580 nir_print_instr(&instr->instr, stderr);
1581 fprintf(stderr, "\n");
1582 }
1583 break;
1584 }
1585 case nir_op_flog2: {
1586 if (dst.size() == 1) {
1587 emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1588 } else {
1589 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1590 nir_print_instr(&instr->instr, stderr);
1591 fprintf(stderr, "\n");
1592 }
1593 break;
1594 }
1595 case nir_op_frcp: {
1596 if (dst.size() == 1) {
1597 emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1598 } else if (dst.size() == 2) {
1599 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1600 } else {
1601 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1602 nir_print_instr(&instr->instr, stderr);
1603 fprintf(stderr, "\n");
1604 }
1605 break;
1606 }
1607 case nir_op_fexp2: {
1608 if (dst.size() == 1) {
1609 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1610 } else {
1611 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1612 nir_print_instr(&instr->instr, stderr);
1613 fprintf(stderr, "\n");
1614 }
1615 break;
1616 }
1617 case nir_op_fsqrt: {
1618 if (dst.size() == 1) {
1619 emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1620 } else if (dst.size() == 2) {
1621 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1622 } else {
1623 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1624 nir_print_instr(&instr->instr, stderr);
1625 fprintf(stderr, "\n");
1626 }
1627 break;
1628 }
1629 case nir_op_ffract: {
1630 if (dst.size() == 1) {
1631 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1632 } else if (dst.size() == 2) {
1633 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1634 } else {
1635 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1636 nir_print_instr(&instr->instr, stderr);
1637 fprintf(stderr, "\n");
1638 }
1639 break;
1640 }
1641 case nir_op_ffloor: {
1642 if (dst.size() == 1) {
1643 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1644 } else if (dst.size() == 2) {
1645 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
1646 } else {
1647 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1648 nir_print_instr(&instr->instr, stderr);
1649 fprintf(stderr, "\n");
1650 }
1651 break;
1652 }
1653 case nir_op_fceil: {
1654 if (dst.size() == 1) {
1655 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1656 } else if (dst.size() == 2) {
1657 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1658 } else {
1659 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1660 nir_print_instr(&instr->instr, stderr);
1661 fprintf(stderr, "\n");
1662 }
1663 break;
1664 }
1665 case nir_op_ftrunc: {
1666 if (dst.size() == 1) {
1667 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1668 } else if (dst.size() == 2) {
1669 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
1670 } else {
1671 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1672 nir_print_instr(&instr->instr, stderr);
1673 fprintf(stderr, "\n");
1674 }
1675 break;
1676 }
1677 case nir_op_fround_even: {
1678 if (dst.size() == 1) {
1679 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1680 } else if (dst.size() == 2) {
1681 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1682 } else {
1683 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1684 nir_print_instr(&instr->instr, stderr);
1685 fprintf(stderr, "\n");
1686 }
1687 break;
1688 }
1689 case nir_op_fsin:
1690 case nir_op_fcos: {
1691 Temp src = get_alu_src(ctx, instr->src[0]);
1692 aco_ptr<Instruction> norm;
1693 if (dst.size() == 1) {
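 /* v_sin_f32/v_cos_f32 expect the angle scaled by 1/(2*PI), i.e. in full turns. */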
 1694 Temp inv_2pi = bld.copy(bld.def(s1), Operand(0x3e22f983u)); /* 1/(2*PI) */
 1695 Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inv_2pi, as_vgpr(ctx, src));
1696
1697 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
1698 if (ctx->options->chip_class < GFX9)
1699 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
1700
1701 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
1702 bld.vop1(opcode, Definition(dst), tmp);
1703 } else {
1704 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1705 nir_print_instr(&instr->instr, stderr);
1706 fprintf(stderr, "\n");
1707 }
1708 break;
1709 }
1710 case nir_op_ldexp: {
1711 if (dst.size() == 1) {
1712 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
1713 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1714 get_alu_src(ctx, instr->src[1]));
1715 } else if (dst.size() == 2) {
1716 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
1717 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1718 get_alu_src(ctx, instr->src[1]));
1719 } else {
1720 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1721 nir_print_instr(&instr->instr, stderr);
1722 fprintf(stderr, "\n");
1723 }
1724 break;
1725 }
1726 case nir_op_frexp_sig: {
1727 if (dst.size() == 1) {
1728 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
1729 get_alu_src(ctx, instr->src[0]));
1730 } else if (dst.size() == 2) {
1731 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
1732 get_alu_src(ctx, instr->src[0]));
1733 } else {
1734 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1735 nir_print_instr(&instr->instr, stderr);
1736 fprintf(stderr, "\n");
1737 }
1738 break;
1739 }
1740 case nir_op_frexp_exp: {
1741 if (instr->src[0].src.ssa->bit_size == 32) {
1742 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
1743 get_alu_src(ctx, instr->src[0]));
1744 } else if (instr->src[0].src.ssa->bit_size == 64) {
1745 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
1746 get_alu_src(ctx, instr->src[0]));
1747 } else {
1748 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1749 nir_print_instr(&instr->instr, stderr);
1750 fprintf(stderr, "\n");
1751 }
1752 break;
1753 }
1754 case nir_op_fsign: {
1755 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
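 /* fsign(x): first select +1.0 where x > 0, then -1.0 where x < 0; ±0.0 passes through. */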
1756 if (dst.size() == 1) {
1757 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1758 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
1759 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1760 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
1761 } else if (dst.size() == 2) {
1762 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1763 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1764 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
1765
1766 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1767 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
1768 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
1769
1770 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1771 } else {
1772 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1773 nir_print_instr(&instr->instr, stderr);
1774 fprintf(stderr, "\n");
1775 }
1776 break;
1777 }
1778 case nir_op_f2f32: {
1779 if (instr->src[0].src.ssa->bit_size == 64) {
1780 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
1781 } else {
1782 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1783 nir_print_instr(&instr->instr, stderr);
1784 fprintf(stderr, "\n");
1785 }
1786 break;
1787 }
1788 case nir_op_f2f64: {
1789 if (instr->src[0].src.ssa->bit_size == 32) {
1790 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
1791 } else {
1792 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1793 nir_print_instr(&instr->instr, stderr);
1794 fprintf(stderr, "\n");
1795 }
1796 break;
1797 }
1798 case nir_op_i2f32: {
1799 assert(dst.size() == 1);
1800 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
1801 break;
1802 }
1803 case nir_op_i2f64: {
1804 if (instr->src[0].src.ssa->bit_size == 32) {
1805 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
1806 } else if (instr->src[0].src.ssa->bit_size == 64) {
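 /* Convert the two dwords separately (low as unsigned, high as signed),
  * scale the high part by 2^32 with v_ldexp_f64 and add the halves. */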
1807 Temp src = get_alu_src(ctx, instr->src[0]);
1808 RegClass rc = RegClass(src.type(), 1);
1809 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1810 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1811 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1812 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
1813 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1814 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1815
1816 } else {
1817 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1818 nir_print_instr(&instr->instr, stderr);
1819 fprintf(stderr, "\n");
1820 }
1821 break;
1822 }
1823 case nir_op_u2f32: {
1824 assert(dst.size() == 1);
1825 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
1826 break;
1827 }
1828 case nir_op_u2f64: {
1829 if (instr->src[0].src.ssa->bit_size == 32) {
1830 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
1831 } else if (instr->src[0].src.ssa->bit_size == 64) {
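 /* Same as i2f64 above, but both dwords are converted as unsigned. */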
1832 Temp src = get_alu_src(ctx, instr->src[0]);
1833 RegClass rc = RegClass(src.type(), 1);
1834 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1835 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1836 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1837 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
1838 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1839 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1840 } else {
1841 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1842 nir_print_instr(&instr->instr, stderr);
1843 fprintf(stderr, "\n");
1844 }
1845 break;
1846 }
1847 case nir_op_f2i32: {
1848 Temp src = get_alu_src(ctx, instr->src[0]);
1849 if (instr->src[0].src.ssa->bit_size == 32) {
1850 if (dst.type() == RegType::vgpr)
1851 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
1852 else
1853 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1854 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
1855
1856 } else if (instr->src[0].src.ssa->bit_size == 64) {
1857 if (dst.type() == RegType::vgpr)
1858 bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
1859 else
1860 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1861 bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
1862
1863 } else {
1864 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1865 nir_print_instr(&instr->instr, stderr);
1866 fprintf(stderr, "\n");
1867 }
1868 break;
1869 }
1870 case nir_op_f2u32: {
1871 Temp src = get_alu_src(ctx, instr->src[0]);
1872 if (instr->src[0].src.ssa->bit_size == 32) {
1873 if (dst.type() == RegType::vgpr)
1874 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
1875 else
1876 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1877 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
1878
1879 } else if (instr->src[0].src.ssa->bit_size == 64) {
1880 if (dst.type() == RegType::vgpr)
1881 bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
1882 else
1883 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1884 bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
1885
1886 } else {
1887 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1888 nir_print_instr(&instr->instr, stderr);
1889 fprintf(stderr, "\n");
1890 }
1891 break;
1892 }
1893 case nir_op_f2i64: {
1894 Temp src = get_alu_src(ctx, instr->src[0]);
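 /* There is no float -> 64-bit integer conversion instruction, so build the result
  * manually: extract exponent and mantissa, shift the mantissa into place, apply the
  * sign and saturate when the exponent indicates the value does not fit. */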
1895 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
1896 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1897 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
1898 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1899 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1900 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1901 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
1902 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1903 Temp new_exponent = bld.tmp(v1);
1904 Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
1905 if (ctx->program->chip_class >= GFX8)
1906 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
1907 else
1908 mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
1909 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
1910 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1911 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1912 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
1913 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
1914 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
1915 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
1916 Temp new_lower = bld.tmp(v1);
1917 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
1918 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
1919 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
1920
1921 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
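 /* Scalar version of the algorithm above: s_bfe with operand 0x80017 extracts
  * 8 exponent bits starting at bit 23. */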
1922 if (src.type() == RegType::vgpr)
1923 src = bld.as_uniform(src);
1924 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1925 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1926 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1927 exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
1928 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1929 Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
1930 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1931 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
1932 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1933 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
1934 mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
1935 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
1936 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
1937 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
1938 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1939 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1940 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
1941 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
1942 Temp borrow = bld.tmp(s1);
1943 lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
1944 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
1945 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1946
1947 } else if (instr->src[0].src.ssa->bit_size == 64) {
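 /* Split the truncated double into dwords: multiply by 2^-32 (0x3df00000...) and floor
  * to get the high dword, then fma with -2^32 (0xc1f00000...) to recover the low dword. */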
1948 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1949 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1950 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1951 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1952 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1953 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1954 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1955 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
1956 if (dst.type() == RegType::sgpr) {
1957 lower = bld.as_uniform(lower);
1958 upper = bld.as_uniform(upper);
1959 }
1960 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1961
1962 } else {
1963 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1964 nir_print_instr(&instr->instr, stderr);
1965 fprintf(stderr, "\n");
1966 }
1967 break;
1968 }
1969 case nir_op_f2u64: {
1970 Temp src = get_alu_src(ctx, instr->src[0]);
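 /* Unsigned variant of the f2i64 lowering: shift the mantissa left or right depending
  * on the exponent and saturate to all-ones when the exponent is 64 or more. */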
1971 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
1972 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1973 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent);
1974 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
1975 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1976 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1977 Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
1978 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
1979 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1980 Temp new_exponent = bld.tmp(v1);
1981 Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
1982 if (ctx->program->chip_class >= GFX8)
1983 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
1984 else
1985 mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
1986 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1987 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1988 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
1989 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
1990 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
1991 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
1992 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1993
1994 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1995 if (src.type() == RegType::vgpr)
1996 src = bld.as_uniform(src);
1997 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1998 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1999 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2000 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2001 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2002 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
2003 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
2004 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2005 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
2006 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
2007 Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
2008 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
2009 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2010 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2011 Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
2012 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2013 upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
2014 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2015
2016 } else if (instr->src[0].src.ssa->bit_size == 64) {
2017 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2018 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
2019 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2020 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2021 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
2022 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2023 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2024 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2025 if (dst.type() == RegType::sgpr) {
2026 lower = bld.as_uniform(lower);
2027 upper = bld.as_uniform(upper);
2028 }
2029 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2030
2031 } else {
2032 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2033 nir_print_instr(&instr->instr, stderr);
2034 fprintf(stderr, "\n");
2035 }
2036 break;
2037 }
2038 case nir_op_b2f32: {
2039 Temp src = get_alu_src(ctx, instr->src[0]);
2040 assert(src.regClass() == bld.lm);
2041
2042 if (dst.regClass() == s1) {
2043 src = bool_to_scalar_condition(ctx, src);
2044 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
2045 } else if (dst.regClass() == v1) {
2046 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2047 } else {
2048 unreachable("Wrong destination register class for nir_op_b2f32.");
2049 }
2050 break;
2051 }
2052 case nir_op_b2f64: {
2053 Temp src = get_alu_src(ctx, instr->src[0]);
2054 assert(src.regClass() == bld.lm);
2055
2056 if (dst.regClass() == s2) {
2057 src = bool_to_scalar_condition(ctx, src);
2058 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
2059 } else if (dst.regClass() == v2) {
 2060 Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2061 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
2062 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2063 } else {
2064 unreachable("Wrong destination register class for nir_op_b2f64.");
2065 }
2066 break;
2067 }
2068 case nir_op_i2i32: {
2069 Temp src = get_alu_src(ctx, instr->src[0]);
2070 if (instr->src[0].src.ssa->bit_size == 64) {
2071 /* we can actually just say dst = src, as it would map the lower register */
2072 emit_extract_vector(ctx, src, 0, dst);
2073 } else {
2074 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2075 nir_print_instr(&instr->instr, stderr);
2076 fprintf(stderr, "\n");
2077 }
2078 break;
2079 }
2080 case nir_op_u2u32: {
2081 Temp src = get_alu_src(ctx, instr->src[0]);
2082 if (instr->src[0].src.ssa->bit_size == 16) {
2083 if (dst.regClass() == s1) {
2084 bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
2085 } else {
2086 // TODO: do better with SDWA
2087 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
2088 }
2089 } else if (instr->src[0].src.ssa->bit_size == 64) {
2090 /* we can actually just say dst = src, as it would map the lower register */
2091 emit_extract_vector(ctx, src, 0, dst);
2092 } else {
2093 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2094 nir_print_instr(&instr->instr, stderr);
2095 fprintf(stderr, "\n");
2096 }
2097 break;
2098 }
2099 case nir_op_i2i64: {
2100 Temp src = get_alu_src(ctx, instr->src[0]);
2101 if (src.regClass() == s1) {
 2102 Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
2103 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2104 } else if (src.regClass() == v1) {
2105 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2106 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2107 } else {
2108 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2109 nir_print_instr(&instr->instr, stderr);
2110 fprintf(stderr, "\n");
2111 }
2112 break;
2113 }
2114 case nir_op_u2u64: {
2115 Temp src = get_alu_src(ctx, instr->src[0]);
2116 if (instr->src[0].src.ssa->bit_size == 32) {
2117 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
2118 } else {
2119 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2120 nir_print_instr(&instr->instr, stderr);
2121 fprintf(stderr, "\n");
2122 }
2123 break;
2124 }
2125 case nir_op_b2i32: {
2126 Temp src = get_alu_src(ctx, instr->src[0]);
2127 assert(src.regClass() == bld.lm);
2128
2129 if (dst.regClass() == s1) {
2130 // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
2131 bool_to_scalar_condition(ctx, src, dst);
2132 } else if (dst.regClass() == v1) {
2133 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
2134 } else {
2135 unreachable("Invalid register class for b2i32");
2136 }
2137 break;
2138 }
2139 case nir_op_i2b1: {
2140 Temp src = get_alu_src(ctx, instr->src[0]);
2141 assert(dst.regClass() == bld.lm);
2142
2143 if (src.type() == RegType::vgpr) {
2144 assert(src.regClass() == v1 || src.regClass() == v2);
2145 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
2146 Definition(dst), Operand(0u), src).def(0).setHint(vcc);
2147 } else {
2148 assert(src.regClass() == s1 || src.regClass() == s2);
2149 Temp tmp;
2150 if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
2151 tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp();
2152 } else {
2153 tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
2154 bld.scc(bld.def(s1)), Operand(0u), src);
2155 }
2156 bool_to_vector_condition(ctx, tmp, dst);
2157 }
2158 break;
2159 }
2160 case nir_op_pack_64_2x32_split: {
2161 Temp src0 = get_alu_src(ctx, instr->src[0]);
2162 Temp src1 = get_alu_src(ctx, instr->src[1]);
2163
2164 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2165 break;
2166 }
2167 case nir_op_unpack_64_2x32_split_x:
2168 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2169 break;
2170 case nir_op_unpack_64_2x32_split_y:
2171 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2172 break;
2173 case nir_op_pack_half_2x16: {
2174 Temp src = get_alu_src(ctx, instr->src[0], 2);
2175
2176 if (dst.regClass() == v1) {
2177 Temp src0 = bld.tmp(v1);
2178 Temp src1 = bld.tmp(v1);
2179 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2180 if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
2181 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2182 else
2183 bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
 2184 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src0),
 2185 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src1));
2186 } else {
2187 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2188 nir_print_instr(&instr->instr, stderr);
2189 fprintf(stderr, "\n");
2190 }
2191 break;
2192 }
2193 case nir_op_unpack_half_2x16_split_x: {
2194 if (dst.regClass() == v1) {
2195 Builder bld(ctx->program, ctx->block);
2196 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2197 } else {
2198 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2199 nir_print_instr(&instr->instr, stderr);
2200 fprintf(stderr, "\n");
2201 }
2202 break;
2203 }
2204 case nir_op_unpack_half_2x16_split_y: {
2205 if (dst.regClass() == v1) {
2206 Builder bld(ctx->program, ctx->block);
2207 /* TODO: use SDWA here */
2208 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2209 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2210 } else {
2211 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2212 nir_print_instr(&instr->instr, stderr);
2213 fprintf(stderr, "\n");
2214 }
2215 break;
2216 }
2217 case nir_op_fquantize2f16: {
2218 Temp src = get_alu_src(ctx, instr->src[0]);
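 /* Quantize by converting to f16 and back to f32; f16 denormals are flushed to zero
  * (keeping the sign of the source when signed zeros must be preserved). */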
2219 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
2220 Temp f32, cmp_res;
2221
2222 if (ctx->program->chip_class >= GFX8) {
 2223 Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* matches everything except negative/positive f16 denormals */
2224 cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
2225 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2226 } else {
 2227 /* 0x38800000 is the smallest normal half-float value (2^-14) as a 32-bit float,
 2228  * so compare |result| against it and flush to zero if it is smaller.
 2229  */
2230 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2231 Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u));
2232 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), f32, smallest);
2233 static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2234 cmp_res = vop3->definitions[0].getTemp();
2235 }
2236
2237 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32 || ctx->program->chip_class < GFX8) {
2238 Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src));
2239 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
2240 } else {
2241 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2242 }
2243 break;
2244 }
2245 case nir_op_bfm: {
2246 Temp bits = get_alu_src(ctx, instr->src[0]);
2247 Temp offset = get_alu_src(ctx, instr->src[1]);
2248
2249 if (dst.regClass() == s1) {
2250 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2251 } else if (dst.regClass() == v1) {
2252 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2253 } else {
2254 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2255 nir_print_instr(&instr->instr, stderr);
2256 fprintf(stderr, "\n");
2257 }
2258 break;
2259 }
2260 case nir_op_bitfield_select: {
2261 /* (mask & insert) | (~mask & base) */
2262 Temp bitmask = get_alu_src(ctx, instr->src[0]);
2263 Temp insert = get_alu_src(ctx, instr->src[1]);
2264 Temp base = get_alu_src(ctx, instr->src[2]);
2265
2266 /* dst = (insert & bitmask) | (base & ~bitmask) */
2267 if (dst.regClass() == s1) {
2268 aco_ptr<Instruction> sop2;
2269 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2270 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2271 Operand lhs;
2272 if (const_insert && const_bitmask) {
2273 lhs = Operand(const_insert->u32 & const_bitmask->u32);
2274 } else {
2275 insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2276 lhs = Operand(insert);
2277 }
2278
2279 Operand rhs;
2280 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2281 if (const_base && const_bitmask) {
2282 rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2283 } else {
2284 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2285 rhs = Operand(base);
2286 }
2287
2288 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2289
2290 } else if (dst.regClass() == v1) {
2291 if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2292 base = as_vgpr(ctx, base);
2293 if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2294 insert = as_vgpr(ctx, insert);
2295
2296 bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2297
2298 } else {
2299 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2300 nir_print_instr(&instr->instr, stderr);
2301 fprintf(stderr, "\n");
2302 }
2303 break;
2304 }
2305 case nir_op_ubfe:
2306 case nir_op_ibfe: {
2307 Temp base = get_alu_src(ctx, instr->src[0]);
2308 Temp offset = get_alu_src(ctx, instr->src[1]);
2309 Temp bits = get_alu_src(ctx, instr->src[2]);
2310
2311 if (dst.type() == RegType::sgpr) {
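 /* s_bfe takes the field offset in the low bits and the field width in bits [22:16]
  * of its second source, so pack both into a single operand. */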
2312 Operand extract;
2313 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2314 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2315 if (const_offset && const_bits) {
2316 uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2317 extract = Operand(const_extract);
2318 } else {
2319 Operand width;
2320 if (const_bits) {
2321 width = Operand(const_bits->u32 << 16);
2322 } else {
2323 width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2324 }
2325 extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2326 }
2327
2328 aco_opcode opcode;
2329 if (dst.regClass() == s1) {
2330 if (instr->op == nir_op_ubfe)
2331 opcode = aco_opcode::s_bfe_u32;
2332 else
2333 opcode = aco_opcode::s_bfe_i32;
2334 } else if (dst.regClass() == s2) {
2335 if (instr->op == nir_op_ubfe)
2336 opcode = aco_opcode::s_bfe_u64;
2337 else
2338 opcode = aco_opcode::s_bfe_i64;
2339 } else {
2340 unreachable("Unsupported BFE bit size");
2341 }
2342
2343 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2344
2345 } else {
2346 aco_opcode opcode;
2347 if (dst.regClass() == v1) {
2348 if (instr->op == nir_op_ubfe)
2349 opcode = aco_opcode::v_bfe_u32;
2350 else
2351 opcode = aco_opcode::v_bfe_i32;
2352 } else {
2353 unreachable("Unsupported BFE bit size");
2354 }
2355
2356 emit_vop3a_instruction(ctx, instr, opcode, dst);
2357 }
2358 break;
2359 }
2360 case nir_op_bit_count: {
2361 Temp src = get_alu_src(ctx, instr->src[0]);
2362 if (src.regClass() == s1) {
2363 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2364 } else if (src.regClass() == v1) {
2365 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2366 } else if (src.regClass() == v2) {
2367 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2368 emit_extract_vector(ctx, src, 1, v1),
2369 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2370 emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2371 } else if (src.regClass() == s2) {
2372 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2373 } else {
2374 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2375 nir_print_instr(&instr->instr, stderr);
2376 fprintf(stderr, "\n");
2377 }
2378 break;
2379 }
2380 case nir_op_flt: {
2381 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
2382 break;
2383 }
2384 case nir_op_fge: {
2385 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
2386 break;
2387 }
2388 case nir_op_feq: {
2389 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
2390 break;
2391 }
2392 case nir_op_fne: {
2393 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
2394 break;
2395 }
2396 case nir_op_ilt: {
2397 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
2398 break;
2399 }
2400 case nir_op_ige: {
2401 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
2402 break;
2403 }
2404 case nir_op_ieq: {
2405 if (instr->src[0].src.ssa->bit_size == 1)
2406 emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
2407 else
2408 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
2409 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
2410 break;
2411 }
2412 case nir_op_ine: {
2413 if (instr->src[0].src.ssa->bit_size == 1)
2414 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
2415 else
2416 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
2417 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
2418 break;
2419 }
2420 case nir_op_ult: {
2421 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
2422 break;
2423 }
2424 case nir_op_uge: {
2425 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
2426 break;
2427 }
2428 case nir_op_fddx:
2429 case nir_op_fddy:
2430 case nir_op_fddx_fine:
2431 case nir_op_fddy_fine:
2432 case nir_op_fddx_coarse:
2433 case nir_op_fddy_coarse: {
2434 Temp src = get_alu_src(ctx, instr->src[0]);
2435 uint16_t dpp_ctrl1, dpp_ctrl2;
2436 if (instr->op == nir_op_fddx_fine) {
2437 dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
2438 dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
2439 } else if (instr->op == nir_op_fddy_fine) {
2440 dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
2441 dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
2442 } else {
2443 dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
2444 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
2445 dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
2446 else
2447 dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
2448 }
2449
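 /* Read the quad's reference lane(s) via DPP quad_perm (ds_swizzle before GFX8) and
  * subtract to form the derivative; emit_wqm ensures helper invocations contribute. */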
2450 Temp tmp;
2451 if (ctx->program->chip_class >= GFX8) {
2452 Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
2453 tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
2454 } else {
2455 Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
2456 Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
2457 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
2458 }
2459 emit_wqm(ctx, tmp, dst, true);
2460 break;
2461 }
2462 default:
2463 fprintf(stderr, "Unknown NIR ALU instr: ");
2464 nir_print_instr(&instr->instr, stderr);
2465 fprintf(stderr, "\n");
2466 }
2467 }
2468
2469 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
2470 {
2471 Temp dst = get_ssa_temp(ctx, &instr->def);
2472
 2473 // TODO: we really want to have the resulting type, as this would allow for 64-bit literals
 2474 // which otherwise get truncated: the lsb for doubles and the msb for ints.
 2475 // For now, we only use s_mov_b64 with 64-bit inline constants.
2476 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
2477 assert(dst.type() == RegType::sgpr);
2478
2479 Builder bld(ctx->program, ctx->block);
2480
2481 if (instr->def.bit_size == 1) {
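 /* 1-bit booleans are materialized as a lane mask: all ones for true, zero for false. */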
2482 assert(dst.regClass() == bld.lm);
2483 int val = instr->value[0].b ? -1 : 0;
2484 Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
2485 bld.sop1(Builder::s_mov, Definition(dst), op);
2486 } else if (dst.size() == 1) {
2487 bld.copy(Definition(dst), Operand(instr->value[0].u32));
2488 } else {
2489 assert(dst.size() != 1);
2490 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2491 if (instr->def.bit_size == 64)
2492 for (unsigned i = 0; i < dst.size(); i++)
2493 vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
2494 else {
2495 for (unsigned i = 0; i < dst.size(); i++)
2496 vec->operands[i] = Operand{instr->value[i].u32};
2497 }
2498 vec->definitions[0] = Definition(dst);
2499 ctx->block->instructions.emplace_back(std::move(vec));
2500 }
2501 }
2502
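 /* Widen a write mask so that each bit covers 'multiplier' consecutive slots,
  * e.g. turning a per-64-bit-component mask into a per-dword mask. */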
2503 uint32_t widen_mask(uint32_t mask, unsigned multiplier)
2504 {
2505 uint32_t new_mask = 0;
 2506 for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
2507 if (mask & (1u << i))
2508 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
2509 return new_mask;
2510 }
2511
2512 void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2513 {
2514 /* This wouldn't work inside control flow or with indirect offsets but
2515 * that doesn't happen because of nir_lower_io_to_temporaries(). */
2516
2517 unsigned write_mask = nir_intrinsic_write_mask(instr);
2518 unsigned component = nir_intrinsic_component(instr);
2519 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2520 unsigned idx = nir_intrinsic_base(instr) + component;
2521
2522 nir_instr *off_instr = instr->src[1].ssa->parent_instr;
2523 if (off_instr->type != nir_instr_type_load_const) {
2524 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2525 nir_print_instr(off_instr, stderr);
2526 fprintf(stderr, "\n");
2527 }
2528 idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
2529
2530 if (instr->src[0].ssa->bit_size == 64)
2531 write_mask = widen_mask(write_mask, 2);
2532
2533 for (unsigned i = 0; i < 8; ++i) {
2534 if (write_mask & (1 << i)) {
2535 ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
2536 ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
2537 }
2538 idx++;
2539 }
2540 }
2541
2542 void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2543 {
2544 Builder bld(ctx->program, ctx->block);
2545 unsigned write_mask = nir_intrinsic_write_mask(instr);
2546 Operand values[4];
2547 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2548 for (unsigned i = 0; i < 4; ++i) {
2549 if (write_mask & (1 << i)) {
2550 Temp tmp = emit_extract_vector(ctx, src, i, v1);
2551 values[i] = Operand(tmp);
2552 } else {
2553 values[i] = Operand(v1);
2554 }
2555 }
2556
2557 unsigned index = nir_intrinsic_base(instr) / 4;
2558 unsigned target, col_format;
2559 unsigned enabled_channels = 0xF;
2560 aco_opcode compr_op = (aco_opcode)0;
2561
2562 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2563 assert(offset && "Non-const offsets on exports not yet supported");
2564 index += offset->u32;
2565
2566 assert(index != FRAG_RESULT_COLOR);
2567
2568 /* Unlike vertex shader exports, it's fine to use multiple exports to
2569 * export separate channels of one target. So shaders which export both
2570 * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine.
2571 * TODO: combine the exports in those cases and create better code
2572 */
2573
2574 if (index == FRAG_RESULT_SAMPLE_MASK) {
2575
2576 if (ctx->program->info->ps.writes_z) {
2577 target = V_008DFC_SQ_EXP_MRTZ;
2578 enabled_channels = 0x4;
2579 col_format = (unsigned) -1;
2580
2581 values[2] = values[0];
2582 values[0] = Operand(v1);
2583 } else {
2584 bld.exp(aco_opcode::exp, Operand(v1), Operand(values[0]), Operand(v1), Operand(v1),
2585 0xc, V_008DFC_SQ_EXP_MRTZ, true);
2586 return;
2587 }
2588
2589 } else if (index == FRAG_RESULT_DEPTH) {
2590
2591 target = V_008DFC_SQ_EXP_MRTZ;
2592 enabled_channels = 0x1;
2593 col_format = (unsigned) -1;
2594
2595 } else if (index == FRAG_RESULT_STENCIL) {
2596
2597 if (ctx->program->info->ps.writes_z) {
2598 target = V_008DFC_SQ_EXP_MRTZ;
2599 enabled_channels = 0x2;
2600 col_format = (unsigned) -1;
2601
2602 values[1] = values[0];
2603 values[0] = Operand(v1);
2604 } else {
2605 values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]);
2606 bld.exp(aco_opcode::exp, values[0], Operand(v1), Operand(v1), Operand(v1),
2607 0x3, V_008DFC_SQ_EXP_MRTZ, true);
2608 return;
2609 }
2610