nir: Add a new memory_barrier_tcs_patch intrinsic
[mesa.git] / src / amd / compiler / aco_instruction_selection.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <map>
29
30 #include "ac_shader_util.h"
31 #include "aco_ir.h"
32 #include "aco_builder.h"
33 #include "aco_interface.h"
34 #include "aco_instruction_selection_setup.cpp"
35 #include "util/fast_idiv_by_const.h"
36
37 namespace aco {
38 namespace {
39
40 class loop_info_RAII {
41 isel_context* ctx;
42 unsigned header_idx_old;
43 Block* exit_old;
44 bool divergent_cont_old;
45 bool divergent_branch_old;
46 bool divergent_if_old;
47
48 public:
49 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
50 : ctx(ctx),
51 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
52 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
53 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
54 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
55 {
56 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
57 ctx->cf_info.parent_loop.exit = loop_exit;
58 ctx->cf_info.parent_loop.has_divergent_continue = false;
59 ctx->cf_info.parent_loop.has_divergent_branch = false;
60 ctx->cf_info.parent_if.is_divergent = false;
61 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
62 }
63
64 ~loop_info_RAII()
65 {
66 ctx->cf_info.parent_loop.header_idx = header_idx_old;
67 ctx->cf_info.parent_loop.exit = exit_old;
68 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
69 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
70 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
71 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
72 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
73 ctx->cf_info.exec_potentially_empty = false;
74 }
75 };
76
77 struct if_context {
78 Temp cond;
79
80 bool divergent_old;
81 bool exec_potentially_empty_old;
82
83 unsigned BB_if_idx;
84 unsigned invert_idx;
85 bool then_branch_divergent;
86 Block BB_invert;
87 Block BB_endif;
88 };
89
90 static void visit_cf_list(struct isel_context *ctx,
91 struct exec_list *list);
92
93 static void add_logical_edge(unsigned pred_idx, Block *succ)
94 {
95 succ->logical_preds.emplace_back(pred_idx);
96 }
97
98
99 static void add_linear_edge(unsigned pred_idx, Block *succ)
100 {
101 succ->linear_preds.emplace_back(pred_idx);
102 }
103
104 static void add_edge(unsigned pred_idx, Block *succ)
105 {
106 add_logical_edge(pred_idx, succ);
107 add_linear_edge(pred_idx, succ);
108 }
109
110 static void append_logical_start(Block *b)
111 {
112 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
113 }
114
115 static void append_logical_end(Block *b)
116 {
117 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
118 }
119
120 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
121 {
122 assert(ctx->allocated[def->index].id());
123 return ctx->allocated[def->index];
124 }
125
126 Temp emit_mbcnt(isel_context *ctx, Definition dst,
127 Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
128 {
129 Builder bld(ctx->program, ctx->block);
130 Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
131 Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));
132
133 if (ctx->program->wave_size == 32) {
134 return thread_id_lo;
135 } else {
136 Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
137 return thread_id_hi;
138 }
139 }
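/* Worked example (assuming both masks are ~0u, i.e. the default operands):
 * for lane 37 of a wave64, v_mbcnt_lo counts the 32 set bits of mask_lo below
 * the lane, and v_mbcnt_hi adds the 5 set bits of mask_hi in lanes 32..36,
 * giving 32 + 5 = 37 -- the lane's own index within the wave. */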
140
141 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
142 {
143 Builder bld(ctx->program, ctx->block);
144
145 if (!dst.id())
146 dst = bld.tmp(src.regClass());
147
148 assert(src.size() == dst.size());
149
150 if (ctx->stage != fragment_fs) {
151 if (!dst.id())
152 return src;
153
154 bld.copy(Definition(dst), src);
155 return dst;
156 }
157
158 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
159 ctx->program->needs_wqm |= program_needs_wqm;
160 return dst;
161 }
162
163 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
164 {
165 if (index.regClass() == s1)
166 return bld.readlane(bld.def(s1), data, index);
167
168 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
169
170 /* Currently not implemented on GFX6-7 */
171 assert(ctx->options->chip_class >= GFX8);
172
173 if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) {
174 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
175 }
176
177 /* GFX10, wave64 mode:
178 * The bpermute instruction is limited to half-wave operation, which means that it can't
179 * properly support subgroup shuffle like older generations (or wave32 mode), so we
180 * emulate it here.
181 */
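/* Sketch of the idea (lane numbers are illustrative): ds_bpermute_b32 only reads
 * within the requesting lane's own half-wave, so e.g. lane 3 (bit 5 of the lane
 * id clear) fetching from lane 35 (bit 5 set) cannot be served directly. The
 * v_cmp_eq_u32 on (lane_id & 0x20) vs (index & 0x20) below identifies exactly
 * those cross-half lanes so the p_wave64_bpermute pseudo can handle them. */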
182 if (!ctx->has_gfx10_wave64_bpermute) {
183 ctx->has_gfx10_wave64_bpermute = true;
184 ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
185 ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
186 }
187
188 Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
189 Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
190 Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
191 Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi);
192
193 return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
194 bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
195 }
196
197 Temp as_vgpr(isel_context *ctx, Temp val)
198 {
199 if (val.type() == RegType::sgpr) {
200 Builder bld(ctx->program, ctx->block);
201 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
202 }
203 assert(val.type() == RegType::vgpr);
204 return val;
205 }
206
207 //assumes a != 0xffffffff
208 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
209 {
210 assert(b != 0);
211 Builder bld(ctx->program, ctx->block);
212
213 if (util_is_power_of_two_or_zero(b)) {
214 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
215 return;
216 }
217
218 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
219
220 assert(info.multiplier <= 0xffffffff);
221
222 bool pre_shift = info.pre_shift != 0;
223 bool increment = info.increment != 0;
224 bool multiply = true;
225 bool post_shift = info.post_shift != 0;
226
227 if (!pre_shift && !increment && !multiply && !post_shift) {
228 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
229 return;
230 }
231
232 Temp pre_shift_dst = a;
233 if (pre_shift) {
234 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
235 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
236 }
237
238 Temp increment_dst = pre_shift_dst;
239 if (increment) {
240 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
241 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
242 }
243
244 Temp multiply_dst = increment_dst;
245 if (multiply) {
246 multiply_dst = post_shift ? bld.tmp(v1) : dst;
247 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
248 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
249 }
250
251 if (post_shift) {
252 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
253 }
254 }
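/* Worked example (the constants shown are the classic magic numbers for this
 * divisor and are assumed to match what util_compute_fast_udiv_info returns):
 * dividing by b = 3 needs no pre_shift or increment, with multiplier = 0xAAAAAAABu
 * and post_shift = 1, so the emitted sequence is
 *   dst = v_mul_hi_u32(a, 0xAAAAAAAB) >> 1
 * e.g. a = 9 -> mul_hi = 6 -> dst = 3. */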
255
256 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
257 {
258 Builder bld(ctx->program, ctx->block);
259 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
260 }
261
262
263 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
264 {
265 /* no need to extract the whole vector */
266 if (src.regClass() == dst_rc) {
267 assert(idx == 0);
268 return src;
269 }
270 assert(src.size() > idx);
271 Builder bld(ctx->program, ctx->block);
272 auto it = ctx->allocated_vec.find(src.id());
273 /* the size check needs to be early because elements other than 0 may be garbage */
274 if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
275 if (it->second[idx].regClass() == dst_rc) {
276 return it->second[idx];
277 } else {
278 assert(dst_rc.size() == it->second[idx].regClass().size());
279 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
280 return bld.copy(bld.def(dst_rc), it->second[idx]);
281 }
282 }
283
284 if (src.size() == dst_rc.size()) {
285 assert(idx == 0);
286 return bld.copy(bld.def(dst_rc), src);
287 } else {
288 Temp dst = bld.tmp(dst_rc);
289 emit_extract_vector(ctx, src, idx, dst);
290 return dst;
291 }
292 }
293
294 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
295 {
296 if (num_components == 1)
297 return;
298 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
299 return;
300 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
301 split->operands[0] = Operand(vec_src);
302 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
303 for (unsigned i = 0; i < num_components; i++) {
304 elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
305 split->definitions[i] = Definition(elems[i]);
306 }
307 ctx->block->instructions.emplace_back(std::move(split));
308 ctx->allocated_vec.emplace(vec_src.id(), elems);
309 }
310
311 /* This vector expansion uses a mask to determine which elements in the new vector
312 * come from the original vector. The other elements are undefined. */
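/* For example, expanding a two-component vec_src into a four-component dst with
 * mask = 0b0101 yields dst = (vec_src[0], 0, vec_src[1], 0): components whose
 * mask bit is clear are written as zero here but must be treated as undefined
 * by callers. */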
313 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
314 {
315 emit_split_vector(ctx, vec_src, util_bitcount(mask));
316
317 if (vec_src == dst)
318 return;
319
320 Builder bld(ctx->program, ctx->block);
321 if (num_components == 1) {
322 if (dst.type() == RegType::sgpr)
323 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
324 else
325 bld.copy(Definition(dst), vec_src);
326 return;
327 }
328
329 unsigned component_size = dst.size() / num_components;
330 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
331
332 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
333 vec->definitions[0] = Definition(dst);
334 unsigned k = 0;
335 for (unsigned i = 0; i < num_components; i++) {
336 if (mask & (1 << i)) {
337 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
338 if (dst.type() == RegType::sgpr)
339 src = bld.as_uniform(src);
340 vec->operands[i] = Operand(src);
341 } else {
342 vec->operands[i] = Operand(0u);
343 }
344 elems[i] = vec->operands[i].getTemp();
345 }
346 ctx->block->instructions.emplace_back(std::move(vec));
347 ctx->allocated_vec.emplace(dst.id(), elems);
348 }
349
350 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
351 {
352 Builder bld(ctx->program, ctx->block);
353 if (!dst.id())
354 dst = bld.tmp(bld.lm);
355
356 assert(val.regClass() == s1);
357 assert(dst.regClass() == bld.lm);
358
359 return bld.sop2(Builder::s_cselect, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
360 }
361
362 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
363 {
364 Builder bld(ctx->program, ctx->block);
365 if (!dst.id())
366 dst = bld.tmp(s1);
367
368 assert(val.regClass() == bld.lm);
369 assert(dst.regClass() == s1);
370
371 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
372 Temp tmp = bld.tmp(s1);
373 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
374 return emit_wqm(ctx, tmp, dst);
375 }
376
377 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
378 {
379 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
380 return get_ssa_temp(ctx, src.src.ssa);
381
382 if (src.src.ssa->num_components == size) {
383 bool identity_swizzle = true;
384 for (unsigned i = 0; identity_swizzle && i < size; i++) {
385 if (src.swizzle[i] != i)
386 identity_swizzle = false;
387 }
388 if (identity_swizzle)
389 return get_ssa_temp(ctx, src.src.ssa);
390 }
391
392 Temp vec = get_ssa_temp(ctx, src.src.ssa);
393 unsigned elem_size = vec.size() / src.src.ssa->num_components;
394 assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
395 assert(vec.size() % elem_size == 0);
396
397 RegClass elem_rc = RegClass(vec.type(), elem_size);
398 if (size == 1) {
399 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
400 } else {
401 assert(size <= 4);
402 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
403 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
404 for (unsigned i = 0; i < size; ++i) {
405 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
406 vec_instr->operands[i] = Operand{elems[i]};
407 }
408 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
409 vec_instr->definitions[0] = Definition(dst);
410 ctx->block->instructions.emplace_back(std::move(vec_instr));
411 ctx->allocated_vec.emplace(dst.id(), elems);
412 return dst;
413 }
414 }
415
416 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
417 {
418 if (ptr.size() == 2)
419 return ptr;
420 Builder bld(ctx->program, ctx->block);
421 if (ptr.type() == RegType::vgpr)
422 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
423 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
424 ptr, Operand((unsigned)ctx->options->address32_hi));
425 }
426
427 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
428 {
429 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
430 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
431 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
432 sop2->definitions[0] = Definition(dst);
433 if (writes_scc)
434 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
435 ctx->block->instructions.emplace_back(std::move(sop2));
436 }
437
438 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
439 bool commutative, bool swap_srcs=false, bool flush_denorms = false)
440 {
441 Builder bld(ctx->program, ctx->block);
442 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
443 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
444 if (src1.type() == RegType::sgpr) {
445 if (commutative && src0.type() == RegType::vgpr) {
446 Temp t = src0;
447 src0 = src1;
448 src1 = t;
449 } else if (src0.type() == RegType::vgpr &&
450 op != aco_opcode::v_madmk_f32 &&
451 op != aco_opcode::v_madak_f32 &&
452 op != aco_opcode::v_madmk_f16 &&
453 op != aco_opcode::v_madak_f16) {
454 /* If the instruction is not commutative, we emit a VOP3A instruction */
455 bld.vop2_e64(op, Definition(dst), src0, src1);
456 return;
457 } else {
458 src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
459 }
460 }
461
462 if (flush_denorms && ctx->program->chip_class < GFX9) {
463 assert(dst.size() == 1);
464 Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
465 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
466 } else {
467 bld.vop2(op, Definition(dst), src0, src1);
468 }
469 }
470
471 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
472 bool flush_denorms = false)
473 {
474 Temp src0 = get_alu_src(ctx, instr->src[0]);
475 Temp src1 = get_alu_src(ctx, instr->src[1]);
476 Temp src2 = get_alu_src(ctx, instr->src[2]);
477
478 /* ensure that the instruction has at most 1 sgpr operand;
479 * The optimizer will inline constants for us */
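/* E.g. if all three sources are SGPRs, the first two checks below copy src0 and
 * src1 to VGPRs, leaving only src2 as the single allowed SGPR operand. */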
480 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
481 src0 = as_vgpr(ctx, src0);
482 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
483 src1 = as_vgpr(ctx, src1);
484 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
485 src2 = as_vgpr(ctx, src2);
486
487 Builder bld(ctx->program, ctx->block);
488 if (flush_denorms && ctx->program->chip_class < GFX9) {
489 assert(dst.size() == 1);
490 Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
491 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
492 } else {
493 bld.vop3(op, Definition(dst), src0, src1, src2);
494 }
495 }
496
497 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
498 {
499 Builder bld(ctx->program, ctx->block);
500 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
501 }
502
503 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
504 {
505 Temp src0 = get_alu_src(ctx, instr->src[0]);
506 Temp src1 = get_alu_src(ctx, instr->src[1]);
507 assert(src0.size() == src1.size());
508
509 aco_ptr<Instruction> vopc;
510 if (src1.type() == RegType::sgpr) {
511 if (src0.type() == RegType::vgpr) {
512 /* to swap the operands, we might also have to change the opcode */
513 switch (op) {
514 case aco_opcode::v_cmp_lt_f32:
515 op = aco_opcode::v_cmp_gt_f32;
516 break;
517 case aco_opcode::v_cmp_ge_f32:
518 op = aco_opcode::v_cmp_le_f32;
519 break;
520 case aco_opcode::v_cmp_lt_i32:
521 op = aco_opcode::v_cmp_gt_i32;
522 break;
523 case aco_opcode::v_cmp_ge_i32:
524 op = aco_opcode::v_cmp_le_i32;
525 break;
526 case aco_opcode::v_cmp_lt_u32:
527 op = aco_opcode::v_cmp_gt_u32;
528 break;
529 case aco_opcode::v_cmp_ge_u32:
530 op = aco_opcode::v_cmp_le_u32;
531 break;
532 case aco_opcode::v_cmp_lt_f64:
533 op = aco_opcode::v_cmp_gt_f64;
534 break;
535 case aco_opcode::v_cmp_ge_f64:
536 op = aco_opcode::v_cmp_le_f64;
537 break;
538 case aco_opcode::v_cmp_lt_i64:
539 op = aco_opcode::v_cmp_gt_i64;
540 break;
541 case aco_opcode::v_cmp_ge_i64:
542 op = aco_opcode::v_cmp_le_i64;
543 break;
544 case aco_opcode::v_cmp_lt_u64:
545 op = aco_opcode::v_cmp_gt_u64;
546 break;
547 case aco_opcode::v_cmp_ge_u64:
548 op = aco_opcode::v_cmp_le_u64;
549 break;
550 default: /* eq and ne are commutative */
551 break;
552 }
553 Temp t = src0;
554 src0 = src1;
555 src1 = t;
556 } else {
557 src1 = as_vgpr(ctx, src1);
558 }
559 }
560
561 Builder bld(ctx->program, ctx->block);
562 bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
563 }
564
565 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
566 {
567 Temp src0 = get_alu_src(ctx, instr->src[0]);
568 Temp src1 = get_alu_src(ctx, instr->src[1]);
569 Builder bld(ctx->program, ctx->block);
570
571 assert(dst.regClass() == bld.lm);
572 assert(src0.type() == RegType::sgpr);
573 assert(src1.type() == RegType::sgpr);
574 assert(src0.regClass() == src1.regClass());
575
576 /* Emit the SALU comparison instruction */
577 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
578 /* Turn the result into a per-lane bool */
579 bool_to_vector_condition(ctx, cmp, dst);
580 }
581
582 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
583 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
584 {
585 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op;
586 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op;
587 bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
588 bool use_valu = s_op == aco_opcode::num_opcodes ||
589 divergent_vals ||
590 ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
591 ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
592 aco_opcode op = use_valu ? v_op : s_op;
593 assert(op != aco_opcode::num_opcodes);
594 assert(dst.regClass() == ctx->program->lane_mask);
595
596 if (use_valu)
597 emit_vopc_instruction(ctx, instr, op, dst);
598 else
599 emit_sopc_instruction(ctx, instr, op, dst);
600 }
601
602 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
603 {
604 Builder bld(ctx->program, ctx->block);
605 Temp src0 = get_alu_src(ctx, instr->src[0]);
606 Temp src1 = get_alu_src(ctx, instr->src[1]);
607
608 assert(dst.regClass() == bld.lm);
609 assert(src0.regClass() == bld.lm);
610 assert(src1.regClass() == bld.lm);
611
612 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
613 }
614
615 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
616 {
617 Builder bld(ctx->program, ctx->block);
618 Temp cond = get_alu_src(ctx, instr->src[0]);
619 Temp then = get_alu_src(ctx, instr->src[1]);
620 Temp els = get_alu_src(ctx, instr->src[2]);
621
622 assert(cond.regClass() == bld.lm);
623
624 if (dst.type() == RegType::vgpr) {
625 aco_ptr<Instruction> bcsel;
626 if (dst.size() == 1) {
627 then = as_vgpr(ctx, then);
628 els = as_vgpr(ctx, els);
629
630 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
631 } else if (dst.size() == 2) {
632 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
633 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
634 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
635 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
636
637 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
638 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
639
640 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
641 } else {
642 fprintf(stderr, "Unimplemented NIR instr bit size: ");
643 nir_print_instr(&instr->instr, stderr);
644 fprintf(stderr, "\n");
645 }
646 return;
647 }
648
649 if (instr->dest.dest.ssa.bit_size == 1) {
650 assert(dst.regClass() == bld.lm);
651 assert(then.regClass() == bld.lm);
652 assert(els.regClass() == bld.lm);
653 }
654
655 if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
656 if (dst.regClass() == s1 || dst.regClass() == s2) {
657 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
658 assert(dst.size() == then.size());
659 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
660 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
661 } else {
662 fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
663 nir_print_instr(&instr->instr, stderr);
664 fprintf(stderr, "\n");
665 }
666 return;
667 }
668
669 /* divergent boolean bcsel
670 * this implements bcsel on bools: dst = s0 ? s1 : s2
671 * which is equivalent to: dst = (s0 & s1) | (~s0 & s2) */
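/* 4-lane illustration: cond = 0b1100, then = 0b1010, els = 0b0110 gives
 * (1100 & 1010) | (0011 & 0110) = 0b1010 -- lanes with cond set take the
 * then bit, the remaining lanes take the else bit. */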
672 assert(instr->dest.dest.ssa.bit_size == 1);
673
674 if (cond.id() != then.id())
675 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
676
677 if (cond.id() == els.id())
678 bld.sop1(Builder::s_mov, Definition(dst), then);
679 else
680 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
681 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
682 }
683
684 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
685 aco_opcode op, uint32_t undo)
686 {
687 /* multiply by 16777216 to handle denormals */
688 Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
689 as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
690 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
691 scaled = bld.vop1(op, bld.def(v1), scaled);
692 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
693
694 Temp not_scaled = bld.vop1(op, bld.def(v1), val);
695
696 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
697 }
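/* The class mask (1u << 7) | (1u << 4) selects positive and negative denormals
 * for v_cmp_class_f32, and 'undo' is the constant that cancels the 2^24 pre-scale
 * for the particular op (a sketch of the arithmetic, not taken from the source):
 *   rcp:  1/(x * 2^24) * 2^24     = 1/x        -> undo = 0x4b800000 (2^24)
 *   rsq:  1/sqrt(x * 2^24) * 2^12 = 1/sqrt(x)  -> undo = 0x45800000 (2^12)
 *   sqrt: sqrt(x * 2^24) * 2^-12  = sqrt(x)    -> undo = 0x39800000 (2^-12) */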
698
699 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
700 {
701 if (ctx->block->fp_mode.denorm32 == 0) {
702 bld.vop1(aco_opcode::v_rcp_f32, dst, val);
703 return;
704 }
705
706 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
707 }
708
709 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
710 {
711 if (ctx->block->fp_mode.denorm32 == 0) {
712 bld.vop1(aco_opcode::v_rsq_f32, dst, val);
713 return;
714 }
715
716 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
717 }
718
719 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
720 {
721 if (ctx->block->fp_mode.denorm32 == 0) {
722 bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
723 return;
724 }
725
726 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
727 }
728
729 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
730 {
731 if (ctx->block->fp_mode.denorm32 == 0) {
732 bld.vop1(aco_opcode::v_log_f32, dst, val);
733 return;
734 }
735
736 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
737 }
738
739 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
740 {
741 if (!instr->dest.dest.is_ssa) {
742 fprintf(stderr, "nir alu dst not in ssa: ");
743 nir_print_instr(&instr->instr, stderr);
744 fprintf(stderr, "\n");
745 abort();
746 }
747 Builder bld(ctx->program, ctx->block);
748 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
749 switch(instr->op) {
750 case nir_op_vec2:
751 case nir_op_vec3:
752 case nir_op_vec4: {
753 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
754 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
755 for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
756 elems[i] = get_alu_src(ctx, instr->src[i]);
757 vec->operands[i] = Operand{elems[i]};
758 }
759 vec->definitions[0] = Definition(dst);
760 ctx->block->instructions.emplace_back(std::move(vec));
761 ctx->allocated_vec.emplace(dst.id(), elems);
762 break;
763 }
764 case nir_op_mov: {
765 Temp src = get_alu_src(ctx, instr->src[0]);
766 aco_ptr<Instruction> mov;
767 if (dst.type() == RegType::sgpr) {
768 if (src.type() == RegType::vgpr)
769 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
770 else if (src.regClass() == s1)
771 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
772 else if (src.regClass() == s2)
773 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
774 else
775 unreachable("wrong src register class for nir_op_mov");
776 } else if (dst.regClass() == v1) {
777 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
778 } else if (dst.regClass() == v2) {
779 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
780 } else {
781 nir_print_instr(&instr->instr, stderr);
782 unreachable("Should have been lowered to scalar.");
783 }
784 break;
785 }
786 case nir_op_inot: {
787 Temp src = get_alu_src(ctx, instr->src[0]);
788 if (instr->dest.dest.ssa.bit_size == 1) {
789 assert(src.regClass() == bld.lm);
790 assert(dst.regClass() == bld.lm);
791 bld.sop2(Builder::s_andn2, Definition(dst), bld.def(s1, scc), Operand(exec, bld.lm), src);
792 } else if (dst.regClass() == v1) {
793 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
794 } else if (dst.type() == RegType::sgpr) {
795 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
796 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
797 } else {
798 fprintf(stderr, "Unimplemented NIR instr bit size: ");
799 nir_print_instr(&instr->instr, stderr);
800 fprintf(stderr, "\n");
801 }
802 break;
803 }
804 case nir_op_ineg: {
805 Temp src = get_alu_src(ctx, instr->src[0]);
806 if (dst.regClass() == v1) {
807 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
808 } else if (dst.regClass() == s1) {
809 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
810 } else if (dst.size() == 2) {
811 Temp src0 = bld.tmp(dst.type(), 1);
812 Temp src1 = bld.tmp(dst.type(), 1);
813 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
814
815 if (dst.regClass() == s2) {
816 Temp carry = bld.tmp(s1);
817 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
818 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
819 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
820 } else {
821 Temp lower = bld.tmp(v1);
822 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
823 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
824 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
825 }
826 } else {
827 fprintf(stderr, "Unimplemented NIR instr bit size: ");
828 nir_print_instr(&instr->instr, stderr);
829 fprintf(stderr, "\n");
830 }
831 break;
832 }
833 case nir_op_iabs: {
834 if (dst.regClass() == s1) {
835 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
836 } else if (dst.regClass() == v1) {
837 Temp src = get_alu_src(ctx, instr->src[0]);
838 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
839 } else {
840 fprintf(stderr, "Unimplemented NIR instr bit size: ");
841 nir_print_instr(&instr->instr, stderr);
842 fprintf(stderr, "\n");
843 }
844 break;
845 }
846 case nir_op_isign: {
847 Temp src = get_alu_src(ctx, instr->src[0]);
848 if (dst.regClass() == s1) {
849 Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
850 Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
851 bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
852 } else if (dst.regClass() == s2) {
853 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
854 Temp neqz;
855 if (ctx->program->chip_class >= GFX8)
856 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
857 else
858 neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
859 /* SCC gets zero-extended to 64 bit */
860 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
861 } else if (dst.regClass() == v1) {
862 Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
863 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
864 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
865 } else if (dst.regClass() == v2) {
866 Temp upper = emit_extract_vector(ctx, src, 1, v1);
867 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
868 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
869 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
870 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
871 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
872 } else {
873 fprintf(stderr, "Unimplemented NIR instr bit size: ");
874 nir_print_instr(&instr->instr, stderr);
875 fprintf(stderr, "\n");
876 }
877 break;
878 }
879 case nir_op_imax: {
880 if (dst.regClass() == v1) {
881 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
882 } else if (dst.regClass() == s1) {
883 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
884 } else {
885 fprintf(stderr, "Unimplemented NIR instr bit size: ");
886 nir_print_instr(&instr->instr, stderr);
887 fprintf(stderr, "\n");
888 }
889 break;
890 }
891 case nir_op_umax: {
892 if (dst.regClass() == v1) {
893 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
894 } else if (dst.regClass() == s1) {
895 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
896 } else {
897 fprintf(stderr, "Unimplemented NIR instr bit size: ");
898 nir_print_instr(&instr->instr, stderr);
899 fprintf(stderr, "\n");
900 }
901 break;
902 }
903 case nir_op_imin: {
904 if (dst.regClass() == v1) {
905 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
906 } else if (dst.regClass() == s1) {
907 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
908 } else {
909 fprintf(stderr, "Unimplemented NIR instr bit size: ");
910 nir_print_instr(&instr->instr, stderr);
911 fprintf(stderr, "\n");
912 }
913 break;
914 }
915 case nir_op_umin: {
916 if (dst.regClass() == v1) {
917 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
918 } else if (dst.regClass() == s1) {
919 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
920 } else {
921 fprintf(stderr, "Unimplemented NIR instr bit size: ");
922 nir_print_instr(&instr->instr, stderr);
923 fprintf(stderr, "\n");
924 }
925 break;
926 }
927 case nir_op_ior: {
928 if (instr->dest.dest.ssa.bit_size == 1) {
929 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
930 } else if (dst.regClass() == v1) {
931 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
932 } else if (dst.regClass() == s1) {
933 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
934 } else if (dst.regClass() == s2) {
935 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
936 } else {
937 fprintf(stderr, "Unimplemented NIR instr bit size: ");
938 nir_print_instr(&instr->instr, stderr);
939 fprintf(stderr, "\n");
940 }
941 break;
942 }
943 case nir_op_iand: {
944 if (instr->dest.dest.ssa.bit_size == 1) {
945 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
946 } else if (dst.regClass() == v1) {
947 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
948 } else if (dst.regClass() == s1) {
949 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
950 } else if (dst.regClass() == s2) {
951 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
952 } else {
953 fprintf(stderr, "Unimplemented NIR instr bit size: ");
954 nir_print_instr(&instr->instr, stderr);
955 fprintf(stderr, "\n");
956 }
957 break;
958 }
959 case nir_op_ixor: {
960 if (instr->dest.dest.ssa.bit_size == 1) {
961 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
962 } else if (dst.regClass() == v1) {
963 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
964 } else if (dst.regClass() == s1) {
965 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
966 } else if (dst.regClass() == s2) {
967 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
968 } else {
969 fprintf(stderr, "Unimplemented NIR instr bit size: ");
970 nir_print_instr(&instr->instr, stderr);
971 fprintf(stderr, "\n");
972 }
973 break;
974 }
975 case nir_op_ushr: {
976 if (dst.regClass() == v1) {
977 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
978 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
979 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
980 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
981 } else if (dst.regClass() == v2) {
982 bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
983 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
984 } else if (dst.regClass() == s2) {
985 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
986 } else if (dst.regClass() == s1) {
987 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
988 } else {
989 fprintf(stderr, "Unimplemented NIR instr bit size: ");
990 nir_print_instr(&instr->instr, stderr);
991 fprintf(stderr, "\n");
992 }
993 break;
994 }
995 case nir_op_ishl: {
996 if (dst.regClass() == v1) {
997 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
998 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
999 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
1000 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1001 } else if (dst.regClass() == v2) {
1002 bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
1003 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1004 } else if (dst.regClass() == s1) {
1005 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
1006 } else if (dst.regClass() == s2) {
1007 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1008 } else {
1009 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1010 nir_print_instr(&instr->instr, stderr);
1011 fprintf(stderr, "\n");
1012 }
1013 break;
1014 }
1015 case nir_op_ishr: {
1016 if (dst.regClass() == v1) {
1017 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1018 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1019 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
1020 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1021 } else if (dst.regClass() == v2) {
1022 bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
1023 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1024 } else if (dst.regClass() == s1) {
1025 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1026 } else if (dst.regClass() == s2) {
1027 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1028 } else {
1029 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1030 nir_print_instr(&instr->instr, stderr);
1031 fprintf(stderr, "\n");
1032 }
1033 break;
1034 }
1035 case nir_op_find_lsb: {
1036 Temp src = get_alu_src(ctx, instr->src[0]);
1037 if (src.regClass() == s1) {
1038 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1039 } else if (src.regClass() == v1) {
1040 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1041 } else if (src.regClass() == s2) {
1042 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1043 } else {
1044 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1045 nir_print_instr(&instr->instr, stderr);
1046 fprintf(stderr, "\n");
1047 }
1048 break;
1049 }
1050 case nir_op_ufind_msb:
1051 case nir_op_ifind_msb: {
1052 Temp src = get_alu_src(ctx, instr->src[0]);
1053 if (src.regClass() == s1 || src.regClass() == s2) {
1054 aco_opcode op = src.regClass() == s2 ?
1055 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1056 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1057 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1058
1059 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1060 Operand(src.size() * 32u - 1u), msb_rev);
1061 Temp msb = sub.def(0).getTemp();
1062 Temp carry = sub.def(1).getTemp();
1063
1064 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry);
1065 } else if (src.regClass() == v1) {
1066 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1067 Temp msb_rev = bld.tmp(v1);
1068 emit_vop1_instruction(ctx, instr, op, msb_rev);
1069 Temp msb = bld.tmp(v1);
1070 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1071 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1072 } else {
1073 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1074 nir_print_instr(&instr->instr, stderr);
1075 fprintf(stderr, "\n");
1076 }
1077 break;
1078 }
1079 case nir_op_bitfield_reverse: {
1080 if (dst.regClass() == s1) {
1081 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1082 } else if (dst.regClass() == v1) {
1083 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1084 } else {
1085 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1086 nir_print_instr(&instr->instr, stderr);
1087 fprintf(stderr, "\n");
1088 }
1089 break;
1090 }
1091 case nir_op_iadd: {
1092 if (dst.regClass() == s1) {
1093 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1094 break;
1095 }
1096
1097 Temp src0 = get_alu_src(ctx, instr->src[0]);
1098 Temp src1 = get_alu_src(ctx, instr->src[1]);
1099 if (dst.regClass() == v1) {
1100 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1101 break;
1102 }
1103
1104 assert(src0.size() == 2 && src1.size() == 2);
1105 Temp src00 = bld.tmp(src0.type(), 1);
1106 Temp src01 = bld.tmp(dst.type(), 1);
1107 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1108 Temp src10 = bld.tmp(src1.type(), 1);
1109 Temp src11 = bld.tmp(dst.type(), 1);
1110 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1111
1112 if (dst.regClass() == s2) {
1113 Temp carry = bld.tmp(s1);
1114 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1115 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1116 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1117 } else if (dst.regClass() == v2) {
1118 Temp dst0 = bld.tmp(v1);
1119 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1120 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1121 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1122 } else {
1123 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1124 nir_print_instr(&instr->instr, stderr);
1125 fprintf(stderr, "\n");
1126 }
1127 break;
1128 }
1129 case nir_op_uadd_sat: {
1130 Temp src0 = get_alu_src(ctx, instr->src[0]);
1131 Temp src1 = get_alu_src(ctx, instr->src[1]);
1132 if (dst.regClass() == s1) {
1133 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1134 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1135 src0, src1);
1136 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1137 } else if (dst.regClass() == v1) {
1138 if (ctx->options->chip_class >= GFX9) {
1139 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1140 add->operands[0] = Operand(src0);
1141 add->operands[1] = Operand(src1);
1142 add->definitions[0] = Definition(dst);
1143 add->clamp = 1;
1144 ctx->block->instructions.emplace_back(std::move(add));
1145 } else {
1146 if (src1.regClass() != v1)
1147 std::swap(src0, src1);
1148 assert(src1.regClass() == v1);
1149 Temp tmp = bld.tmp(v1);
1150 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1151 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1152 }
1153 } else {
1154 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1155 nir_print_instr(&instr->instr, stderr);
1156 fprintf(stderr, "\n");
1157 }
1158 break;
1159 }
1160 case nir_op_uadd_carry: {
1161 Temp src0 = get_alu_src(ctx, instr->src[0]);
1162 Temp src1 = get_alu_src(ctx, instr->src[1]);
1163 if (dst.regClass() == s1) {
1164 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1165 break;
1166 }
1167 if (dst.regClass() == v1) {
1168 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1169 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1170 break;
1171 }
1172
1173 Temp src00 = bld.tmp(src0.type(), 1);
1174 Temp src01 = bld.tmp(dst.type(), 1);
1175 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1176 Temp src10 = bld.tmp(src1.type(), 1);
1177 Temp src11 = bld.tmp(dst.type(), 1);
1178 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1179 if (dst.regClass() == s2) {
1180 Temp carry = bld.tmp(s1);
1181 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1182 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1183 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1184 } else if (dst.regClass() == v2) {
1185 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1186 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1187 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1188 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1189 } else {
1190 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1191 nir_print_instr(&instr->instr, stderr);
1192 fprintf(stderr, "\n");
1193 }
1194 break;
1195 }
1196 case nir_op_isub: {
1197 if (dst.regClass() == s1) {
1198 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1199 break;
1200 }
1201
1202 Temp src0 = get_alu_src(ctx, instr->src[0]);
1203 Temp src1 = get_alu_src(ctx, instr->src[1]);
1204 if (dst.regClass() == v1) {
1205 bld.vsub32(Definition(dst), src0, src1);
1206 break;
1207 }
1208
1209 Temp src00 = bld.tmp(src0.type(), 1);
1210 Temp src01 = bld.tmp(dst.type(), 1);
1211 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1212 Temp src10 = bld.tmp(src1.type(), 1);
1213 Temp src11 = bld.tmp(dst.type(), 1);
1214 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1215 if (dst.regClass() == s2) {
1216 Temp carry = bld.tmp(s1);
1217 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1218 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1219 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1220 } else if (dst.regClass() == v2) {
1221 Temp lower = bld.tmp(v1);
1222 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1223 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1224 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1225 } else {
1226 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1227 nir_print_instr(&instr->instr, stderr);
1228 fprintf(stderr, "\n");
1229 }
1230 break;
1231 }
1232 case nir_op_usub_borrow: {
1233 Temp src0 = get_alu_src(ctx, instr->src[0]);
1234 Temp src1 = get_alu_src(ctx, instr->src[1]);
1235 if (dst.regClass() == s1) {
1236 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1237 break;
1238 } else if (dst.regClass() == v1) {
1239 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1240 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1241 break;
1242 }
1243
1244 Temp src00 = bld.tmp(src0.type(), 1);
1245 Temp src01 = bld.tmp(dst.type(), 1);
1246 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1247 Temp src10 = bld.tmp(src1.type(), 1);
1248 Temp src11 = bld.tmp(dst.type(), 1);
1249 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1250 if (dst.regClass() == s2) {
1251 Temp borrow = bld.tmp(s1);
1252 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1253 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1254 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1255 } else if (dst.regClass() == v2) {
1256 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1257 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1258 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1259 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1260 } else {
1261 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1262 nir_print_instr(&instr->instr, stderr);
1263 fprintf(stderr, "\n");
1264 }
1265 break;
1266 }
1267 case nir_op_imul: {
1268 if (dst.regClass() == v1) {
1269 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1270 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1271 } else if (dst.regClass() == s1) {
1272 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1273 } else {
1274 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1275 nir_print_instr(&instr->instr, stderr);
1276 fprintf(stderr, "\n");
1277 }
1278 break;
1279 }
1280 case nir_op_umul_high: {
1281 if (dst.regClass() == v1) {
1282 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1283 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1284 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1285 } else if (dst.regClass() == s1) {
1286 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1287 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1288 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1289 } else {
1290 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1291 nir_print_instr(&instr->instr, stderr);
1292 fprintf(stderr, "\n");
1293 }
1294 break;
1295 }
1296 case nir_op_imul_high: {
1297 if (dst.regClass() == v1) {
1298 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1299 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1300 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1301 } else if (dst.regClass() == s1) {
1302 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1303 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1304 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1305 } else {
1306 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1307 nir_print_instr(&instr->instr, stderr);
1308 fprintf(stderr, "\n");
1309 }
1310 break;
1311 }
1312 case nir_op_fmul: {
1313 if (dst.size() == 1) {
1314 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1315 } else if (dst.size() == 2) {
1316 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1317 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1318 } else {
1319 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1320 nir_print_instr(&instr->instr, stderr);
1321 fprintf(stderr, "\n");
1322 }
1323 break;
1324 }
1325 case nir_op_fadd: {
1326 if (dst.size() == 1) {
1327 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1328 } else if (dst.size() == 2) {
1329 bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1330 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1331 } else {
1332 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1333 nir_print_instr(&instr->instr, stderr);
1334 fprintf(stderr, "\n");
1335 }
1336 break;
1337 }
1338 case nir_op_fsub: {
1339 Temp src0 = get_alu_src(ctx, instr->src[0]);
1340 Temp src1 = get_alu_src(ctx, instr->src[1]);
1341 if (dst.size() == 1) {
1342 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1343 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1344 else
1345 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1346 } else if (dst.size() == 2) {
1347 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1348 get_alu_src(ctx, instr->src[0]),
1349 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1350 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1351 sub->neg[1] = true;
1352 } else {
1353 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1354 nir_print_instr(&instr->instr, stderr);
1355 fprintf(stderr, "\n");
1356 }
1357 break;
1358 }
1359 case nir_op_fmax: {
1360 if (dst.size() == 1) {
1361 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1362 } else if (dst.size() == 2) {
1363 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1364 Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2),
1365 get_alu_src(ctx, instr->src[0]),
1366 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1367 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1368 } else {
1369 bld.vop3(aco_opcode::v_max_f64, Definition(dst),
1370 get_alu_src(ctx, instr->src[0]),
1371 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1372 }
1373 } else {
1374 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1375 nir_print_instr(&instr->instr, stderr);
1376 fprintf(stderr, "\n");
1377 }
1378 break;
1379 }
1380 case nir_op_fmin: {
1381 if (dst.size() == 1) {
1382 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1383 } else if (dst.size() == 2) {
1384 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1385 Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2),
1386 get_alu_src(ctx, instr->src[0]),
1387 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1388 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1389 } else {
1390 bld.vop3(aco_opcode::v_min_f64, Definition(dst),
1391 get_alu_src(ctx, instr->src[0]),
1392 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1393 }
1394 } else {
1395 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1396 nir_print_instr(&instr->instr, stderr);
1397 fprintf(stderr, "\n");
1398 }
1399 break;
1400 }
1401 case nir_op_fmax3: {
1402 if (dst.size() == 1) {
1403 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1404 } else {
1405 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1406 nir_print_instr(&instr->instr, stderr);
1407 fprintf(stderr, "\n");
1408 }
1409 break;
1410 }
1411 case nir_op_fmin3: {
1412 if (dst.size() == 1) {
1413 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1414 } else {
1415 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1416 nir_print_instr(&instr->instr, stderr);
1417 fprintf(stderr, "\n");
1418 }
1419 break;
1420 }
1421 case nir_op_fmed3: {
1422 if (dst.size() == 1) {
1423 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1424 } else {
1425 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1426 nir_print_instr(&instr->instr, stderr);
1427 fprintf(stderr, "\n");
1428 }
1429 break;
1430 }
1431 case nir_op_umax3: {
1432 if (dst.size() == 1) {
1433 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1434 } else {
1435 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1436 nir_print_instr(&instr->instr, stderr);
1437 fprintf(stderr, "\n");
1438 }
1439 break;
1440 }
1441 case nir_op_umin3: {
1442 if (dst.size() == 1) {
1443 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1444 } else {
1445 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1446 nir_print_instr(&instr->instr, stderr);
1447 fprintf(stderr, "\n");
1448 }
1449 break;
1450 }
1451 case nir_op_umed3: {
1452 if (dst.size() == 1) {
1453 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1454 } else {
1455 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1456 nir_print_instr(&instr->instr, stderr);
1457 fprintf(stderr, "\n");
1458 }
1459 break;
1460 }
1461 case nir_op_imax3: {
1462 if (dst.size() == 1) {
1463 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1464 } else {
1465 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1466 nir_print_instr(&instr->instr, stderr);
1467 fprintf(stderr, "\n");
1468 }
1469 break;
1470 }
1471 case nir_op_imin3: {
1472 if (dst.size() == 1) {
1473 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1474 } else {
1475 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1476 nir_print_instr(&instr->instr, stderr);
1477 fprintf(stderr, "\n");
1478 }
1479 break;
1480 }
1481 case nir_op_imed3: {
1482 if (dst.size() == 1) {
1483 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1484 } else {
1485 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1486 nir_print_instr(&instr->instr, stderr);
1487 fprintf(stderr, "\n");
1488 }
1489 break;
1490 }
1491 case nir_op_cube_face_coord: {
1492 Temp in = get_alu_src(ctx, instr->src[0], 3);
1493 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1494 emit_extract_vector(ctx, in, 1, v1),
1495 emit_extract_vector(ctx, in, 2, v1) };
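/* v_cubema_f32 yields 2.0*majorAxis and v_cubesc/v_cubetc the face S/T numerators;
 * the final coordinates are sc/(2*ma) + 0.5 and tc/(2*ma) + 0.5, with the +0.5 folded into v_madak's literal */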
1496 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1497 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1498 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1499 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1500 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1501 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1502 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1503 break;
1504 }
1505 case nir_op_cube_face_index: {
1506 Temp in = get_alu_src(ctx, instr->src[0], 3);
1507 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1508 emit_extract_vector(ctx, in, 1, v1),
1509 emit_extract_vector(ctx, in, 2, v1) };
1510 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1511 break;
1512 }
1513 case nir_op_bcsel: {
1514 emit_bcsel(ctx, instr, dst);
1515 break;
1516 }
1517 case nir_op_frsq: {
1518 if (dst.size() == 1) {
1519 emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1520 } else if (dst.size() == 2) {
1521 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1522 } else {
1523 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1524 nir_print_instr(&instr->instr, stderr);
1525 fprintf(stderr, "\n");
1526 }
1527 break;
1528 }
1529 case nir_op_fneg: {
1530 Temp src = get_alu_src(ctx, instr->src[0]);
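/* flip the sign bit with a xor (only the high dword for f64), flushing denormals first via a multiply by 1.0 when required */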
1531 if (dst.size() == 1) {
1532 if (ctx->block->fp_mode.must_flush_denorms32)
1533 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1534 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1535 } else if (dst.size() == 2) {
1536 if (ctx->block->fp_mode.must_flush_denorms16_64)
1537 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1538 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1539 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1540 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1541 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1542 } else {
1543 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1544 nir_print_instr(&instr->instr, stderr);
1545 fprintf(stderr, "\n");
1546 }
1547 break;
1548 }
1549 case nir_op_fabs: {
1550 Temp src = get_alu_src(ctx, instr->src[0]);
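/* clear the sign bit by ANDing with 0x7FFFFFFF (only the high dword for f64) */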
1551 if (dst.size() == 1) {
1552 if (ctx->block->fp_mode.must_flush_denorms32)
1553 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1554 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1555 } else if (dst.size() == 2) {
1556 if (ctx->block->fp_mode.must_flush_denorms16_64)
1557 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1558 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1559 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1560 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1561 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1562 } else {
1563 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1564 nir_print_instr(&instr->instr, stderr);
1565 fprintf(stderr, "\n");
1566 }
1567 break;
1568 }
1569 case nir_op_fsat: {
1570 Temp src = get_alu_src(ctx, instr->src[0]);
1571 if (dst.size() == 1) {
1572 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1573 /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
1574 // TODO: confirm that this holds under any circumstances
1575 } else if (dst.size() == 2) {
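/* there is no v_med3 for f64, so add +0.0 and use the VOP3 clamp bit to saturate to [0, 1] */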
1576 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1577 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1578 vop3->clamp = true;
1579 } else {
1580 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1581 nir_print_instr(&instr->instr, stderr);
1582 fprintf(stderr, "\n");
1583 }
1584 break;
1585 }
1586 case nir_op_flog2: {
1587 if (dst.size() == 1) {
1588 emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1589 } else {
1590 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1591 nir_print_instr(&instr->instr, stderr);
1592 fprintf(stderr, "\n");
1593 }
1594 break;
1595 }
1596 case nir_op_frcp: {
1597 if (dst.size() == 1) {
1598 emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1599 } else if (dst.size() == 2) {
1600 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1601 } else {
1602 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1603 nir_print_instr(&instr->instr, stderr);
1604 fprintf(stderr, "\n");
1605 }
1606 break;
1607 }
1608 case nir_op_fexp2: {
1609 if (dst.size() == 1) {
1610 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1611 } else {
1612 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1613 nir_print_instr(&instr->instr, stderr);
1614 fprintf(stderr, "\n");
1615 }
1616 break;
1617 }
1618 case nir_op_fsqrt: {
1619 if (dst.size() == 1) {
1620 emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1621 } else if (dst.size() == 2) {
1622 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1623 } else {
1624 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1625 nir_print_instr(&instr->instr, stderr);
1626 fprintf(stderr, "\n");
1627 }
1628 break;
1629 }
1630 case nir_op_ffract: {
1631 if (dst.size() == 1) {
1632 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1633 } else if (dst.size() == 2) {
1634 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1635 } else {
1636 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1637 nir_print_instr(&instr->instr, stderr);
1638 fprintf(stderr, "\n");
1639 }
1640 break;
1641 }
1642 case nir_op_ffloor: {
1643 if (dst.size() == 1) {
1644 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1645 } else if (dst.size() == 2) {
1646 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
1647 } else {
1648 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1649 nir_print_instr(&instr->instr, stderr);
1650 fprintf(stderr, "\n");
1651 }
1652 break;
1653 }
1654 case nir_op_fceil: {
1655 if (dst.size() == 1) {
1656 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1657 } else if (dst.size() == 2) {
1658 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1659 } else {
1660 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1661 nir_print_instr(&instr->instr, stderr);
1662 fprintf(stderr, "\n");
1663 }
1664 break;
1665 }
1666 case nir_op_ftrunc: {
1667 if (dst.size() == 1) {
1668 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1669 } else if (dst.size() == 2) {
1670 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
1671 } else {
1672 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1673 nir_print_instr(&instr->instr, stderr);
1674 fprintf(stderr, "\n");
1675 }
1676 break;
1677 }
1678 case nir_op_fround_even: {
1679 if (dst.size() == 1) {
1680 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1681 } else if (dst.size() == 2) {
1682 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1683 } else {
1684 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1685 nir_print_instr(&instr->instr, stderr);
1686 fprintf(stderr, "\n");
1687 }
1688 break;
1689 }
1690 case nir_op_fsin:
1691 case nir_op_fcos: {
1692 Temp src = get_alu_src(ctx, instr->src[0]);
1694 if (dst.size() == 1) {
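/* 0x3e22f983 is 1/(2*PI): v_sin_f32/v_cos_f32 expect the angle pre-divided by 2*PI */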
1695 Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
1696 Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, as_vgpr(ctx, src));
1697
1698 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
1699 if (ctx->options->chip_class < GFX9)
1700 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
1701
1702 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
1703 bld.vop1(opcode, Definition(dst), tmp);
1704 } else {
1705 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1706 nir_print_instr(&instr->instr, stderr);
1707 fprintf(stderr, "\n");
1708 }
1709 break;
1710 }
1711 case nir_op_ldexp: {
1712 if (dst.size() == 1) {
1713 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
1714 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1715 get_alu_src(ctx, instr->src[1]));
1716 } else if (dst.size() == 2) {
1717 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
1718 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1719 get_alu_src(ctx, instr->src[1]));
1720 } else {
1721 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1722 nir_print_instr(&instr->instr, stderr);
1723 fprintf(stderr, "\n");
1724 }
1725 break;
1726 }
1727 case nir_op_frexp_sig: {
1728 if (dst.size() == 1) {
1729 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
1730 get_alu_src(ctx, instr->src[0]));
1731 } else if (dst.size() == 2) {
1732 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
1733 get_alu_src(ctx, instr->src[0]));
1734 } else {
1735 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1736 nir_print_instr(&instr->instr, stderr);
1737 fprintf(stderr, "\n");
1738 }
1739 break;
1740 }
1741 case nir_op_frexp_exp: {
1742 if (instr->src[0].src.ssa->bit_size == 32) {
1743 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
1744 get_alu_src(ctx, instr->src[0]));
1745 } else if (instr->src[0].src.ssa->bit_size == 64) {
1746 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
1747 get_alu_src(ctx, instr->src[0]));
1748 } else {
1749 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1750 nir_print_instr(&instr->instr, stderr);
1751 fprintf(stderr, "\n");
1752 }
1753 break;
1754 }
1755 case nir_op_fsign: {
1756 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
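/* select 1.0 where src is positive, then -1.0 where the result is still negative; zero is passed through.
 * For f64 only the high dword has to be computed, the low dword is always 0. */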
1757 if (dst.size() == 1) {
1758 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1759 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
1760 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1761 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
1762 } else if (dst.size() == 2) {
1763 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1764 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1765 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
1766
1767 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1768 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
1769 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
1770
1771 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1772 } else {
1773 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1774 nir_print_instr(&instr->instr, stderr);
1775 fprintf(stderr, "\n");
1776 }
1777 break;
1778 }
1779 case nir_op_f2f32: {
1780 if (instr->src[0].src.ssa->bit_size == 64) {
1781 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
1782 } else {
1783 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1784 nir_print_instr(&instr->instr, stderr);
1785 fprintf(stderr, "\n");
1786 }
1787 break;
1788 }
1789 case nir_op_f2f64: {
1790 if (instr->src[0].src.ssa->bit_size == 32) {
1791 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
1792 } else {
1793 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1794 nir_print_instr(&instr->instr, stderr);
1795 fprintf(stderr, "\n");
1796 }
1797 break;
1798 }
1799 case nir_op_i2f32: {
1800 assert(dst.size() == 1);
1801 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
1802 break;
1803 }
1804 case nir_op_i2f64: {
1805 if (instr->src[0].src.ssa->bit_size == 32) {
1806 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
1807 } else if (instr->src[0].src.ssa->bit_size == 64) {
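/* convert both halves separately (low unsigned, high signed), scale the high half by 2^32 with ldexp and add them */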
1808 Temp src = get_alu_src(ctx, instr->src[0]);
1809 RegClass rc = RegClass(src.type(), 1);
1810 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1811 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1812 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1813 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
1814 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1815 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1816
1817 } else {
1818 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1819 nir_print_instr(&instr->instr, stderr);
1820 fprintf(stderr, "\n");
1821 }
1822 break;
1823 }
1824 case nir_op_u2f32: {
1825 assert(dst.size() == 1);
1826 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
1827 break;
1828 }
1829 case nir_op_u2f64: {
1830 if (instr->src[0].src.ssa->bit_size == 32) {
1831 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
1832 } else if (instr->src[0].src.ssa->bit_size == 64) {
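/* same as nir_op_i2f64 above, except the high half is converted unsigned as well */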
1833 Temp src = get_alu_src(ctx, instr->src[0]);
1834 RegClass rc = RegClass(src.type(), 1);
1835 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1836 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1837 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1838 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
1839 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1840 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1841 } else {
1842 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1843 nir_print_instr(&instr->instr, stderr);
1844 fprintf(stderr, "\n");
1845 }
1846 break;
1847 }
1848 case nir_op_f2i32: {
1849 Temp src = get_alu_src(ctx, instr->src[0]);
1850 if (instr->src[0].src.ssa->bit_size == 32) {
1851 if (dst.type() == RegType::vgpr)
1852 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
1853 else
1854 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1855 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
1856
1857 } else if (instr->src[0].src.ssa->bit_size == 64) {
1858 if (dst.type() == RegType::vgpr)
1859 bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
1860 else
1861 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1862 bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
1863
1864 } else {
1865 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1866 nir_print_instr(&instr->instr, stderr);
1867 fprintf(stderr, "\n");
1868 }
1869 break;
1870 }
1871 case nir_op_f2u32: {
1872 Temp src = get_alu_src(ctx, instr->src[0]);
1873 if (instr->src[0].src.ssa->bit_size == 32) {
1874 if (dst.type() == RegType::vgpr)
1875 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
1876 else
1877 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1878 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
1879
1880 } else if (instr->src[0].src.ssa->bit_size == 64) {
1881 if (dst.type() == RegType::vgpr)
1882 bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
1883 else
1884 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1885 bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
1886
1887 } else {
1888 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1889 nir_print_instr(&instr->instr, stderr);
1890 fprintf(stderr, "\n");
1891 }
1892 break;
1893 }
1894 case nir_op_f2i64: {
1895 Temp src = get_alu_src(ctx, instr->src[0]);
1896 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
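/* manual f32->i64: clamp the frexp exponent to [0, 64], place the 24-bit mantissa (with implicit leading one)
 * in the high dword of a 64-bit value, shift it right by 63-exp, saturate to INT64_MAX (0x7fffffff for the high
 * dword via v_bfrev of the inline constant -2) when the exponent overflows, and finally negate the result
 * via (x ^ sign) - sign if the input was negative */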
1897 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1898 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
1899 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1900 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1901 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1902 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
1903 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1904 Temp new_exponent = bld.tmp(v1);
1905 Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
1906 if (ctx->program->chip_class >= GFX8)
1907 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
1908 else
1909 mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
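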
1910 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
1911 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1912 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1913 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
1914 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
1915 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
1916 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
1917 Temp new_lower = bld.tmp(v1);
1918 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
1919 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
1920 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
1921
1922 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
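/* scalar version of the conversion above: the biased exponent is taken from bits [30:23] with s_bfe and rebased by subtracting 126 */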
1923 if (src.type() == RegType::vgpr)
1924 src = bld.as_uniform(src);
1925 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1926 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1927 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1928 exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
1929 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1930 Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
1931 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1932 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
1933 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1934 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
1935 mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
1936 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
1937 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
1938 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
1939 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1940 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1941 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
1942 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
1943 Temp borrow = bld.tmp(s1);
1944 lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
1945 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
1946 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1947
1948 } else if (instr->src[0].src.ssa->bit_size == 64) {
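/* split the double into dwords: floor(trunc(src) * 2^-32) is the (signed) high dword and
 * fma(floor, -2^32, trunc) leaves the low dword as an unsigned remainder */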
1949 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1950 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1951 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1952 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1953 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1954 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1955 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1956 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
1957 if (dst.type() == RegType::sgpr) {
1958 lower = bld.as_uniform(lower);
1959 upper = bld.as_uniform(upper);
1960 }
1961 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1962
1963 } else {
1964 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1965 nir_print_instr(&instr->instr, stderr);
1966 fprintf(stderr, "\n");
1967 }
1968 break;
1969 }
1970 case nir_op_f2u64: {
1971 Temp src = get_alu_src(ctx, instr->src[0]);
1972 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
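/* unsigned variant: exponents below 24 shift the mantissa right (small path), larger ones shift the
 * 64-bit mantissa left by exp-24; inputs with exp > 64 saturate to UINT64_MAX */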
1973 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1974 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent);
1975 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
1976 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1977 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1978 Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
1979 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
1980 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), mantissa, Operand(0u));
1981 Temp new_exponent = bld.tmp(v1);
1982 Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
1983 if (ctx->program->chip_class >= GFX8)
1984 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
1985 else
1986 mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
1987 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1988 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1989 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
1990 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
1991 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
1992 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
1993 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1994
1995 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1996 if (src.type() == RegType::vgpr)
1997 src = bld.as_uniform(src);
1998 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1999 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2000 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2001 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2002 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2003 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
2004 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
2005 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), mantissa, Operand(0u));
2006 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
2007 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
2008 Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
2009 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
2010 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2011 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2012 Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
2013 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2014 upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
2015 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2016
2017 } else if (instr->src[0].src.ssa->bit_size == 64) {
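/* same dword-splitting trick as in nir_op_f2i64, but both halves are converted unsigned */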
2018 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2019 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
2020 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2021 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2022 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
2023 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2024 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2025 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2026 if (dst.type() == RegType::sgpr) {
2027 lower = bld.as_uniform(lower);
2028 upper = bld.as_uniform(upper);
2029 }
2030 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2031
2032 } else {
2033 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2034 nir_print_instr(&instr->instr, stderr);
2035 fprintf(stderr, "\n");
2036 }
2037 break;
2038 }
2039 case nir_op_b2f32: {
2040 Temp src = get_alu_src(ctx, instr->src[0]);
2041 assert(src.regClass() == bld.lm);
2042
2043 if (dst.regClass() == s1) {
2044 src = bool_to_scalar_condition(ctx, src);
2045 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
2046 } else if (dst.regClass() == v1) {
2047 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2048 } else {
2049 unreachable("Wrong destination register class for nir_op_b2f32.");
2050 }
2051 break;
2052 }
2053 case nir_op_b2f64: {
2054 Temp src = get_alu_src(ctx, instr->src[0]);
2055 assert(src.regClass() == bld.lm);
2056
2057 if (dst.regClass() == s2) {
2058 src = bool_to_scalar_condition(ctx, src);
2059 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3FF0000000000000lu), Operand(0u), bld.scc(src));
2060 } else if (dst.regClass() == v2) {
2061 Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2062 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
2063 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2064 } else {
2065 unreachable("Wrong destination register class for nir_op_b2f64.");
2066 }
2067 break;
2068 }
2069 case nir_op_i2i32: {
2070 Temp src = get_alu_src(ctx, instr->src[0]);
2071 if (instr->src[0].src.ssa->bit_size == 64) {
2072 /* we can actually just say dst = src, as it would map the lower register */
2073 emit_extract_vector(ctx, src, 0, dst);
2074 } else {
2075 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2076 nir_print_instr(&instr->instr, stderr);
2077 fprintf(stderr, "\n");
2078 }
2079 break;
2080 }
2081 case nir_op_u2u32: {
2082 Temp src = get_alu_src(ctx, instr->src[0]);
2083 if (instr->src[0].src.ssa->bit_size == 16) {
2084 if (dst.regClass() == s1) {
2085 bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
2086 } else {
2087 // TODO: do better with SDWA
2088 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
2089 }
2090 } else if (instr->src[0].src.ssa->bit_size == 64) {
2091 /* we can actually just say dst = src, as it would map the lower register */
2092 emit_extract_vector(ctx, src, 0, dst);
2093 } else {
2094 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2095 nir_print_instr(&instr->instr, stderr);
2096 fprintf(stderr, "\n");
2097 }
2098 break;
2099 }
2100 case nir_op_i2i64: {
2101 Temp src = get_alu_src(ctx, instr->src[0]);
2102 if (src.regClass() == s1) {
2103 Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
2104 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2105 } else if (src.regClass() == v1) {
2106 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2107 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2108 } else {
2109 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2110 nir_print_instr(&instr->instr, stderr);
2111 fprintf(stderr, "\n");
2112 }
2113 break;
2114 }
2115 case nir_op_u2u64: {
2116 Temp src = get_alu_src(ctx, instr->src[0]);
2117 if (instr->src[0].src.ssa->bit_size == 32) {
2118 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
2119 } else {
2120 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2121 nir_print_instr(&instr->instr, stderr);
2122 fprintf(stderr, "\n");
2123 }
2124 break;
2125 }
2126 case nir_op_b2i32: {
2127 Temp src = get_alu_src(ctx, instr->src[0]);
2128 assert(src.regClass() == bld.lm);
2129
2130 if (dst.regClass() == s1) {
2131 // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
2132 bool_to_scalar_condition(ctx, src, dst);
2133 } else if (dst.regClass() == v1) {
2134 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
2135 } else {
2136 unreachable("Invalid register class for b2i32");
2137 }
2138 break;
2139 }
2140 case nir_op_i2b1: {
2141 Temp src = get_alu_src(ctx, instr->src[0]);
2142 assert(dst.regClass() == bld.lm);
2143
2144 if (src.type() == RegType::vgpr) {
2145 assert(src.regClass() == v1 || src.regClass() == v2);
2146 assert(dst.regClass() == bld.lm);
2147 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
2148 Definition(dst), Operand(0u), src).def(0).setHint(vcc);
2149 } else {
2150 assert(src.regClass() == s1 || src.regClass() == s2);
2151 Temp tmp;
2152 if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
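/* GFX7 and earlier have no s_cmp_lg_u64, so OR the value with 0 and use the SCC result instead */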
2153 tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp();
2154 } else {
2155 tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
2156 bld.scc(bld.def(s1)), Operand(0u), src);
2157 }
2158 bool_to_vector_condition(ctx, tmp, dst);
2159 }
2160 break;
2161 }
2162 case nir_op_pack_64_2x32_split: {
2163 Temp src0 = get_alu_src(ctx, instr->src[0]);
2164 Temp src1 = get_alu_src(ctx, instr->src[1]);
2165
2166 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2167 break;
2168 }
2169 case nir_op_unpack_64_2x32_split_x:
2170 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2171 break;
2172 case nir_op_unpack_64_2x32_split_y:
2173 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2174 break;
2175 case nir_op_pack_half_2x16: {
2176 Temp src = get_alu_src(ctx, instr->src[0], 2);
2177
2178 if (dst.regClass() == v1) {
2179 Temp src0 = bld.tmp(v1);
2180 Temp src1 = bld.tmp(v1);
2181 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2182 if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
2183 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2184 else
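/* v_cvt_pkrtz always rounds towards zero, so for other rounding modes convert each half separately and pack the low 16 bits */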
2185 bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
2186 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src0),
2187 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src1));
2188 } else {
2189 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2190 nir_print_instr(&instr->instr, stderr);
2191 fprintf(stderr, "\n");
2192 }
2193 break;
2194 }
2195 case nir_op_unpack_half_2x16_split_x: {
2196 if (dst.regClass() == v1) {
2197 Builder bld(ctx->program, ctx->block);
2198 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2199 } else {
2200 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2201 nir_print_instr(&instr->instr, stderr);
2202 fprintf(stderr, "\n");
2203 }
2204 break;
2205 }
2206 case nir_op_unpack_half_2x16_split_y: {
2207 if (dst.regClass() == v1) {
2208 Builder bld(ctx->program, ctx->block);
2209 /* TODO: use SDWA here */
2210 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2211 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2212 } else {
2213 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2214 nir_print_instr(&instr->instr, stderr);
2215 fprintf(stderr, "\n");
2216 }
2217 break;
2218 }
2219 case nir_op_fquantize2f16: {
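/* round src to f16 precision by converting to f16 and back; results that are f16 denormals get flushed to (sign-preserving, if required) zero */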
2220 Temp src = get_alu_src(ctx, instr->src[0]);
2221 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
2222 Temp f32, cmp_res;
2223
2224 if (ctx->program->chip_class >= GFX8) {
2225 Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
2226 cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
2227 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2228 } else {
2229 /* 0x38800000 is smallest half float value (2^-14) in 32-bit float,
2230 * so compare the result and flush to 0 if it's smaller.
2231 */
2232 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2233 Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u));
2234 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), f32, smallest);
2235 static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2236 cmp_res = vop3->definitions[0].getTemp();
2237 }
2238
2239 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32 || ctx->program->chip_class < GFX8) {
2240 Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src));
2241 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
2242 } else {
2243 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2244 }
2245 break;
2246 }
2247 case nir_op_bfm: {
2248 Temp bits = get_alu_src(ctx, instr->src[0]);
2249 Temp offset = get_alu_src(ctx, instr->src[1]);
2250
2251 if (dst.regClass() == s1) {
2252 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2253 } else if (dst.regClass() == v1) {
2254 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2255 } else {
2256 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2257 nir_print_instr(&instr->instr, stderr);
2258 fprintf(stderr, "\n");
2259 }
2260 break;
2261 }
2262 case nir_op_bitfield_select: {
2263 /* (mask & insert) | (~mask & base) */
2264 Temp bitmask = get_alu_src(ctx, instr->src[0]);
2265 Temp insert = get_alu_src(ctx, instr->src[1]);
2266 Temp base = get_alu_src(ctx, instr->src[2]);
2267
2268 /* dst = (insert & bitmask) | (base & ~bitmask) */
2269 if (dst.regClass() == s1) {
2270 aco_ptr<Instruction> sop2;
2271 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2272 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2273 Operand lhs;
2274 if (const_insert && const_bitmask) {
2275 lhs = Operand(const_insert->u32 & const_bitmask->u32);
2276 } else {
2277 insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2278 lhs = Operand(insert);
2279 }
2280
2281 Operand rhs;
2282 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2283 if (const_base && const_bitmask) {
2284 rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2285 } else {
2286 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2287 rhs = Operand(base);
2288 }
2289
2290 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2291
2292 } else if (dst.regClass() == v1) {
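/* v_bfi_b32 can read at most one SGPR operand (constant bus limit), so move the others to VGPRs */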
2293 if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2294 base = as_vgpr(ctx, base);
2295 if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2296 insert = as_vgpr(ctx, insert);
2297
2298 bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2299
2300 } else {
2301 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2302 nir_print_instr(&instr->instr, stderr);
2303 fprintf(stderr, "\n");
2304 }
2305 break;
2306 }
2307 case nir_op_ubfe:
2308 case nir_op_ibfe: {
2309 Temp base = get_alu_src(ctx, instr->src[0]);
2310 Temp offset = get_alu_src(ctx, instr->src[1]);
2311 Temp bits = get_alu_src(ctx, instr->src[2]);
2312
2313 if (dst.type() == RegType::sgpr) {
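/* s_bfe takes the field as (width << 16) | offset; fold it into an immediate when both are constants */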
2314 Operand extract;
2315 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2316 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2317 if (const_offset && const_bits) {
2318 uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2319 extract = Operand(const_extract);
2320 } else {
2321 Operand width;
2322 if (const_bits) {
2323 width = Operand(const_bits->u32 << 16);
2324 } else {
2325 width = bld.sop2(aco_opcode::s_lshl_b32, bld.def