Revert "radv: add support for MRTs compaction to avoid holes"
[mesa.git] / src / amd / compiler / aco_instruction_selection.cpp
/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <algorithm>
#include <array>
#include <stack>
#include <map>

#include "ac_shader_util.h"
#include "aco_ir.h"
#include "aco_builder.h"
#include "aco_interface.h"
#include "aco_instruction_selection_setup.cpp"
#include "util/fast_idiv_by_const.h"

namespace aco {
namespace {

class loop_info_RAII {
   isel_context* ctx;
   unsigned header_idx_old;
   Block* exit_old;
   bool divergent_cont_old;
   bool divergent_branch_old;
   bool divergent_if_old;

public:
   loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
      : ctx(ctx),
        header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
        divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
        divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
        divergent_if_old(ctx->cf_info.parent_if.is_divergent)
   {
      ctx->cf_info.parent_loop.header_idx = loop_header_idx;
      ctx->cf_info.parent_loop.exit = loop_exit;
      ctx->cf_info.parent_loop.has_divergent_continue = false;
      ctx->cf_info.parent_loop.has_divergent_branch = false;
      ctx->cf_info.parent_if.is_divergent = false;
      ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
   }

   ~loop_info_RAII()
   {
      ctx->cf_info.parent_loop.header_idx = header_idx_old;
      ctx->cf_info.parent_loop.exit = exit_old;
      ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
      ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
      ctx->cf_info.parent_if.is_divergent = divergent_if_old;
      ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
      if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
         ctx->cf_info.exec_potentially_empty_discard = false;
   }
};

struct if_context {
   Temp cond;

   bool divergent_old;
   bool exec_potentially_empty_discard_old;
   bool exec_potentially_empty_break_old;
   uint16_t exec_potentially_empty_break_depth_old;

   unsigned BB_if_idx;
   unsigned invert_idx;
   bool uniform_has_then_branch;
   bool then_branch_divergent;
   Block BB_invert;
   Block BB_endif;
};

static bool visit_cf_list(struct isel_context *ctx,
                          struct exec_list *list);

static void add_logical_edge(unsigned pred_idx, Block *succ)
{
   succ->logical_preds.emplace_back(pred_idx);
}


static void add_linear_edge(unsigned pred_idx, Block *succ)
{
   succ->linear_preds.emplace_back(pred_idx);
}

static void add_edge(unsigned pred_idx, Block *succ)
{
   add_logical_edge(pred_idx, succ);
   add_linear_edge(pred_idx, succ);
}

static void append_logical_start(Block *b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
}

static void append_logical_end(Block *b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
}

Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
{
   assert(ctx->allocated[def->index].id());
   return ctx->allocated[def->index];
}

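/* Computes the invocation's lane index within the wave by counting the set
 * bits of the given mask below the current lane: v_mbcnt_lo covers the low
 * 32 lanes and, on wave64, v_mbcnt_hi accumulates the high half on top. */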
Temp emit_mbcnt(isel_context *ctx, Definition dst,
                Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
{
   Builder bld(ctx->program, ctx->block);
   Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
   Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));

   if (ctx->program->wave_size == 32) {
      return thread_id_lo;
   } else {
      Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
      return thread_id_hi;
   }
}

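/* Copies src into dst through a p_wqm pseudo-instruction so the value is
 * computed in whole quad mode; outside of fragment shaders this degenerates
 * into a plain copy. */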
Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
{
   Builder bld(ctx->program, ctx->block);

   if (!dst.id())
      dst = bld.tmp(src.regClass());

   assert(src.size() == dst.size());

   if (ctx->stage != fragment_fs) {
      if (!dst.id())
         return src;

      bld.copy(Definition(dst), src);
      return dst;
   }

   bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
   ctx->program->needs_wqm |= program_needs_wqm;
   return dst;
}

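/* Emits a wave-wide lane shuffle (each lane reads data[index]). Uniform
 * indices become a v_readlane; GFX6-7 lack ds_bpermute, so a pseudo-
 * instruction is emitted and lowered later; on GFX10 wave64, ds_bpermute
 * only shuffles within 32-lane halves, so a full-wave variant is emulated
 * with shared VGPRs; everywhere else ds_bpermute is used directly. */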
static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
{
   if (index.regClass() == s1)
      return bld.readlane(bld.def(s1), data, index);

   if (ctx->options->chip_class <= GFX7) {
      /* GFX6-7: there is no bpermute instruction */
      Operand index_op(index);
      Operand input_data(data);
      index_op.setLateKill(true);
      input_data.setLateKill(true);

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
   } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
      /* GFX10 wave64 mode: emulate full-wave bpermute */
      if (!ctx->has_gfx10_wave64_bpermute) {
         ctx->has_gfx10_wave64_bpermute = true;
         ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
         ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
      }

      Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
      Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
      Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
      Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
      Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
      Operand input_data(data);

      index_x4.setLateKill(true);
      input_data.setLateKill(true);
      same_half.setLateKill(true);

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
   } else {
      /* GFX8-9 or GFX10 wave32: bpermute works normally */
      Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
      return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
   }
}

Temp as_vgpr(isel_context *ctx, Temp val)
{
   if (val.type() == RegType::sgpr) {
      Builder bld(ctx->program, ctx->block);
      return bld.copy(bld.def(RegType::vgpr, val.size()), val);
   }
   assert(val.type() == RegType::vgpr);
   return val;
}

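/* Emits an unsigned division of a by the constant b without the hardware
 * divider: powers of two become a single shift, everything else uses the
 * multiply-by-reciprocal parameters from util_compute_fast_udiv_info(),
 * i.e. roughly dst = umulhi((a >> pre_shift) + increment, multiplier) >> post_shift. */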
//assumes a != 0xffffffff
void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
{
   assert(b != 0);
   Builder bld(ctx->program, ctx->block);

   if (util_is_power_of_two_or_zero(b)) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
      return;
   }

   util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);

   assert(info.multiplier <= 0xffffffff);

   bool pre_shift = info.pre_shift != 0;
   bool increment = info.increment != 0;
   bool multiply = true;
   bool post_shift = info.post_shift != 0;

   if (!pre_shift && !increment && !multiply && !post_shift) {
      bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
      return;
   }

   Temp pre_shift_dst = a;
   if (pre_shift) {
      pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
   }

   Temp increment_dst = pre_shift_dst;
   if (increment) {
      increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
      bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
   }

   Temp multiply_dst = increment_dst;
   if (multiply) {
      multiply_dst = post_shift ? bld.tmp(v1) : dst;
      bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
               bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
   }

   if (post_shift) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
   }
}

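/* Extracts component idx of vector src into dst via p_extract_vector. The
 * overload below returns a fresh temporary instead and reuses elements that
 * were already split off and cached in ctx->allocated_vec when possible. */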
void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
}


Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
{
   /* no need to extract the whole vector */
   if (src.regClass() == dst_rc) {
      assert(idx == 0);
      return src;
   }

   assert(src.bytes() > (idx * dst_rc.bytes()));
   Builder bld(ctx->program, ctx->block);
   auto it = ctx->allocated_vec.find(src.id());
   if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
      if (it->second[idx].regClass() == dst_rc) {
         return it->second[idx];
      } else {
         assert(!dst_rc.is_subdword());
         assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
         return bld.copy(bld.def(dst_rc), it->second[idx]);
      }
   }

   if (dst_rc.is_subdword())
      src = as_vgpr(ctx, src);

   if (src.bytes() == dst_rc.bytes()) {
      assert(idx == 0);
      return bld.copy(bld.def(dst_rc), src);
   } else {
      Temp dst = bld.tmp(dst_rc);
      emit_extract_vector(ctx, src, idx, dst);
      return dst;
   }
}

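/* Splits vec_src into num_components equally sized elements and caches them
 * in ctx->allocated_vec so later extracts can reuse them; this is a no-op if
 * the vector was already split. */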
void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
{
   if (num_components == 1)
      return;
   if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
      return;
   RegClass rc;
   if (num_components > vec_src.size()) {
      if (vec_src.type() == RegType::sgpr) {
         /* should still help get_alu_src() */
         emit_split_vector(ctx, vec_src, vec_src.size());
         return;
      }
      /* sub-dword split */
      rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
   } else {
      rc = RegClass(vec_src.type(), vec_src.size() / num_components);
   }
   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
   split->operands[0] = Operand(vec_src);
   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
   for (unsigned i = 0; i < num_components; i++) {
      elems[i] = {ctx->program->allocateId(), rc};
      split->definitions[i] = Definition(elems[i]);
   }
   ctx->block->instructions.emplace_back(std::move(split));
   ctx->allocated_vec.emplace(vec_src.id(), elems);
}

/* This vector expansion uses a mask to determine which elements in the new vector
 * come from the original vector. The other elements are undefined. */
void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
{
   emit_split_vector(ctx, vec_src, util_bitcount(mask));

   if (vec_src == dst)
      return;

   Builder bld(ctx->program, ctx->block);
   if (num_components == 1) {
      if (dst.type() == RegType::sgpr)
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
      else
         bld.copy(Definition(dst), vec_src);
      return;
   }

   unsigned component_size = dst.size() / num_components;
   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;

   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
   vec->definitions[0] = Definition(dst);
   unsigned k = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (mask & (1 << i)) {
         Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
         if (dst.type() == RegType::sgpr)
            src = bld.as_uniform(src);
         vec->operands[i] = Operand(src);
      } else {
         vec->operands[i] = Operand(0u);
      }
      elems[i] = vec->operands[i].getTemp();
   }
   ctx->block->instructions.emplace_back(std::move(vec));
   ctx->allocated_vec.emplace(dst.id(), elems);
}

/* adjust misaligned small bit size loads */
void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Operand shift;
   Temp select = Temp();
   if (offset.isConstant()) {
      assert(offset.constantValue() && offset.constantValue() < 4);
      shift = Operand(offset.constantValue() * 8);
   } else {
      /* bit_offset = 8 * (offset & 0x3) */
      Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
      select = bld.tmp(s1);
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
   }

   if (vec.size() == 1) {
      bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
   } else if (vec.size() == 2) {
      Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
      bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
      if (tmp == dst)
         emit_split_vector(ctx, dst, 2);
      else
         emit_extract_vector(ctx, tmp, 0, dst);
   } else if (vec.size() == 4) {
      Temp lo = bld.tmp(s2), hi = bld.tmp(s2);
      bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
      hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
      if (select != Temp())
         hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
      lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
      Temp mid = bld.tmp(s1);
      lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
      hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
      mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
      emit_split_vector(ctx, dst, 2);
   }
}

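/* Shifts a vector right by a byte offset (dynamic offsets go through
 * v_alignbyte_b32) and re-packs the first dst.bytes()/component_size
 * components into dst, handling both VGPR and SGPR destinations. */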
void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
{
   Builder bld(ctx->program, ctx->block);
   if (offset.isTemp()) {
      Temp tmp[4] = {vec, vec, vec, vec};

      if (vec.size() == 4) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec);
      } else if (vec.size() == 3) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
      } else if (vec.size() == 2) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
      }
      for (unsigned i = 0; i < dst.size(); i++)
         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);

      vec = tmp[0];
      if (dst.size() == 2)
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);

      offset = Operand(0u);
   }

   unsigned num_components = dst.bytes() / component_size;
   if (vec.regClass() == dst.regClass()) {
      assert(offset.constantValue() == 0);
      bld.copy(Definition(dst), vec);
      emit_split_vector(ctx, dst, num_components);
      return;
   }

   emit_split_vector(ctx, vec, vec.bytes() / component_size);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();

   assert(offset.constantValue() % component_size == 0);
   unsigned skip = offset.constantValue() / component_size;
   for (unsigned i = 0; i < num_components; i++)
      elems[i] = emit_extract_vector(ctx, vec, i + skip, rc);

   /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
   if (dst.type() == RegType::vgpr) {
      aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
      for (unsigned i = 0; i < num_components; i++)
         create_vec->operands[i] = Operand(elems[i]);
      create_vec->definitions[0] = Definition(dst);
      bld.insert(std::move(create_vec));

   /* if dst is sgpr - split the src, but move the original to sgpr. */
   } else if (skip) {
      vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
      byte_align_scalar(ctx, vec, offset, dst);
   } else {
      assert(dst.size() == vec.size());
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
   }

   ctx->allocated_vec.emplace(dst.id(), elems);
}

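/* Converts a scalar (SCC-style) boolean into a per-lane mask by selecting
 * between -1 and 0; bool_to_scalar_condition() below handles the inverse
 * direction by ANDing the mask with exec. */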
Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(bld.lm);

   assert(val.regClass() == s1);
   assert(dst.regClass() == bld.lm);

   return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
}

Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(s1);

   assert(val.regClass() == bld.lm);
   assert(dst.regClass() == s1);

   /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
   Temp tmp = bld.tmp(s1);
   bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
   return emit_wqm(ctx, tmp, dst);
}

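/* Returns the requested NIR ALU source as a Temp of `size` components,
 * resolving swizzles: identity swizzles are returned directly, sub-dword
 * SGPR reads go through s_bfe_u32, and everything else is re-assembled
 * from individually extracted components. */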
Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
{
   if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
      return get_ssa_temp(ctx, src.src.ssa);

   if (src.src.ssa->num_components == size) {
      bool identity_swizzle = true;
      for (unsigned i = 0; identity_swizzle && i < size; i++) {
         if (src.swizzle[i] != i)
            identity_swizzle = false;
      }
      if (identity_swizzle)
         return get_ssa_temp(ctx, src.src.ssa);
   }

   Temp vec = get_ssa_temp(ctx, src.src.ssa);
   unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
   assert(elem_size > 0);
   assert(vec.bytes() % elem_size == 0);

   if (elem_size < 4 && vec.type() == RegType::sgpr) {
      assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
      assert(size == 1);
      unsigned swizzle = src.swizzle[0];
      if (vec.size() > 1) {
         assert(src.src.ssa->bit_size == 16);
         vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
         swizzle = swizzle & 1;
      }
      if (swizzle == 0)
         return vec;

      Temp dst{ctx->program->allocateId(), s1};
      aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)};
      bfe->operands[0] = Operand(vec);
      bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
      bfe->definitions[0] = Definition(dst);
      bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
      ctx->block->instructions.emplace_back(std::move(bfe));
      return dst;
   }

   RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
   if (size == 1) {
      return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
   } else {
      assert(size <= 4);
      std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
      aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
      for (unsigned i = 0; i < size; ++i) {
         elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
         vec_instr->operands[i] = Operand{elems[i]};
      }
      Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)};
      vec_instr->definitions[0] = Definition(dst);
      ctx->block->instructions.emplace_back(std::move(vec_instr));
      ctx->allocated_vec.emplace(dst.id(), elems);
      return dst;
   }
}

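/* Widens a 32-bit address to a 64-bit pointer by appending the high address
 * bits from the driver options; VGPR pointers are made uniform first. */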
Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
{
   if (ptr.size() == 2)
      return ptr;
   Builder bld(ctx->program, ctx->block);
   if (ptr.type() == RegType::vgpr)
      ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
                     ptr, Operand((unsigned)ctx->options->address32_hi));
}

void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
{
   aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
   sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
   sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
   sop2->definitions[0] = Definition(dst);
   if (writes_scc)
      sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
   ctx->block->instructions.emplace_back(std::move(sop2));
}

void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
                           bool commutative, bool swap_srcs=false, bool flush_denorms = false)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
   Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
   if (src1.type() == RegType::sgpr) {
      if (commutative && src0.type() == RegType::vgpr) {
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   if (flush_denorms && ctx->program->chip_class < GFX9) {
      assert(dst.size() == 1);
      Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
      bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
   } else {
      bld.vop2(op, Definition(dst), src0, src1);
   }
}

void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
                                   aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   if (src1.type() == RegType::sgpr) {
      assert(src0.type() == RegType::vgpr);
      std::swap(src0, src1);
   }

   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(src0.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(v1);
   Temp src11 = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
   Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
   Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
}

void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
                            bool flush_denorms = false)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   Temp src2 = get_alu_src(ctx, instr->src[2]);

   /* ensure that the instruction has at most 1 sgpr operand
    * The optimizer will inline constants for us */
   if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
      src0 = as_vgpr(ctx, src0);
   if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
      src1 = as_vgpr(ctx, src1);
   if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
      src2 = as_vgpr(ctx, src2);

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (flush_denorms && ctx->program->chip_class < GFX9) {
      assert(dst.size() == 1);
      Temp tmp = bld.vop3(op, bld.def(v1), src0, src1, src2);
      bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
   } else {
      bld.vop3(op, Definition(dst), src0, src1, src2);
   }
}

void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (dst.type() == RegType::sgpr)
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
   else
      bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
}

void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   assert(src0.size() == src1.size());

   aco_ptr<Instruction> vopc;
   if (src1.type() == RegType::sgpr) {
      if (src0.type() == RegType::vgpr) {
         /* to swap the operands, we might also have to change the opcode */
         switch (op) {
         case aco_opcode::v_cmp_lt_f16:
            op = aco_opcode::v_cmp_gt_f16;
            break;
         case aco_opcode::v_cmp_ge_f16:
            op = aco_opcode::v_cmp_le_f16;
            break;
         case aco_opcode::v_cmp_lt_i16:
            op = aco_opcode::v_cmp_gt_i16;
            break;
         case aco_opcode::v_cmp_ge_i16:
            op = aco_opcode::v_cmp_le_i16;
            break;
         case aco_opcode::v_cmp_lt_u16:
            op = aco_opcode::v_cmp_gt_u16;
            break;
         case aco_opcode::v_cmp_ge_u16:
            op = aco_opcode::v_cmp_le_u16;
            break;
         case aco_opcode::v_cmp_lt_f32:
            op = aco_opcode::v_cmp_gt_f32;
            break;
         case aco_opcode::v_cmp_ge_f32:
            op = aco_opcode::v_cmp_le_f32;
            break;
         case aco_opcode::v_cmp_lt_i32:
            op = aco_opcode::v_cmp_gt_i32;
            break;
         case aco_opcode::v_cmp_ge_i32:
            op = aco_opcode::v_cmp_le_i32;
            break;
         case aco_opcode::v_cmp_lt_u32:
            op = aco_opcode::v_cmp_gt_u32;
            break;
         case aco_opcode::v_cmp_ge_u32:
            op = aco_opcode::v_cmp_le_u32;
            break;
         case aco_opcode::v_cmp_lt_f64:
            op = aco_opcode::v_cmp_gt_f64;
            break;
         case aco_opcode::v_cmp_ge_f64:
            op = aco_opcode::v_cmp_le_f64;
            break;
         case aco_opcode::v_cmp_lt_i64:
            op = aco_opcode::v_cmp_gt_i64;
            break;
         case aco_opcode::v_cmp_ge_i64:
            op = aco_opcode::v_cmp_le_i64;
            break;
         case aco_opcode::v_cmp_lt_u64:
            op = aco_opcode::v_cmp_gt_u64;
            break;
         case aco_opcode::v_cmp_ge_u64:
            op = aco_opcode::v_cmp_le_u64;
            break;
         default: /* eq and ne are commutative */
            break;
         }
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Builder bld(ctx->program, ctx->block);
   bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
}

void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   Builder bld(ctx->program, ctx->block);

   assert(dst.regClass() == bld.lm);
   assert(src0.type() == RegType::sgpr);
   assert(src1.type() == RegType::sgpr);
   assert(src0.regClass() == src1.regClass());

   /* Emit the SALU comparison instruction */
   Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
   /* Turn the result into a per-lane bool */
   bool_to_vector_condition(ctx, cmp, dst);
}

void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
                     aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
{
   aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
   aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
   bool use_valu = s_op == aco_opcode::num_opcodes ||
                   nir_dest_is_divergent(instr->dest.dest) ||
                   ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
                   ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
   aco_opcode op = use_valu ? v_op : s_op;
   assert(op != aco_opcode::num_opcodes);
   assert(dst.regClass() == ctx->program->lane_mask);

   if (use_valu)
      emit_vopc_instruction(ctx, instr, op, dst);
   else
      emit_sopc_instruction(ctx, instr, op, dst);
}

void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   assert(dst.regClass() == bld.lm);
   assert(src0.regClass() == bld.lm);
   assert(src1.regClass() == bld.lm);

   bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
}

void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp cond = get_alu_src(ctx, instr->src[0]);
   Temp then = get_alu_src(ctx, instr->src[1]);
   Temp els = get_alu_src(ctx, instr->src[2]);

   assert(cond.regClass() == bld.lm);

   if (dst.type() == RegType::vgpr) {
      aco_ptr<Instruction> bcsel;
      if (dst.size() == 1) {
         then = as_vgpr(ctx, then);
         els = as_vgpr(ctx, els);

         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
      } else if (dst.size() == 2) {
         Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
         Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);

         Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
         Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      return;
   }

   if (instr->dest.dest.ssa.bit_size == 1) {
      assert(dst.regClass() == bld.lm);
      assert(then.regClass() == bld.lm);
      assert(els.regClass() == bld.lm);
   }

   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
      if (dst.regClass() == s1 || dst.regClass() == s2) {
         assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
         assert(dst.size() == then.size());
         aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
         bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
      } else {
         fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      return;
   }

   /* divergent boolean bcsel
    * this implements bcsel on bools: dst = s0 ? s1 : s2
    * which is going to be: dst = (s0 & s1) | (~s0 & s2) */
   assert(instr->dest.dest.ssa.bit_size == 1);

   if (cond.id() != then.id())
      then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);

   if (cond.id() == els.id())
      bld.sop1(Builder::s_mov, Definition(dst), then);
   else
      bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
               bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
}

void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
                    aco_opcode op, uint32_t undo)
{
   /* multiply by 16777216 to handle denormals */
   Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
                               as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
   Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
   scaled = bld.vop1(op, bld.def(v1), scaled);
   scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);

   Temp not_scaled = bld.vop1(op, bld.def(v1), val);

   bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
}

void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rcp_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
}

void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rsq_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
}

void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
}

void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_log_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
}

Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);

   /* GFX6 doesn't support V_TRUNC_F64, lower it. */
   /* TODO: create more efficient code! */
   if (val.type() == RegType::sgpr)
      val = as_vgpr(ctx, val);

   /* Split the input value. */
   Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);

   /* Extract the exponent and compute the unbiased value. */
   Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
   exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));

   /* Extract the fractional part. */
   Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
   fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);

   Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);

   Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
   Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
   fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
   tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
   fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);

   /* Get the sign bit. */
   Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);

   /* Decide the operation to apply depending on the unbiased exponent. */
   Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
   Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
   Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
   Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
   dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
   dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);

   return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
}

Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);

   /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
    * lowered at NIR level for precision reasons). */
   Temp src0 = as_vgpr(ctx, val);

   Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
   Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));

   Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
   Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
   Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);

   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);

   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);

   Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);

   Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
   static_cast<VOP3A_instruction*>(add)->neg[1] = true;

   return add->definitions[0].getTemp();
}

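/* Sign- or zero-extends (or truncates) an integer from src_bits to dst_bits,
 * using SALU sext/and for SGPRs, an SDWA mov on GFX8+, and v_bfe on GFX6-7,
 * then appends a high dword for 64-bit destinations. */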
Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) {
   if (!dst.id()) {
      if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
         dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
      else
         dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
   }

   if (dst.bytes() == src.bytes() && dst_bits < src_bits)
      return bld.copy(Definition(dst), src);
   else if (dst.bytes() < src.bytes())
      return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));

   Temp tmp = dst;
   if (dst_bits == 64)
      tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);

   if (tmp == src) {
   } else if (src.regClass() == s1) {
      if (is_signed)
         bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
      else
         bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
   } else if (ctx->options->chip_class >= GFX8) {
      assert(src_bits != 8 || src.regClass() == v1b);
      assert(src_bits != 16 || src.regClass() == v2b);
      aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
      sdwa->operands[0] = Operand(src);
      sdwa->definitions[0] = Definition(tmp);
      if (is_signed)
         sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
      else
         sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
      sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
      bld.insert(std::move(sdwa));
   } else {
      assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
      aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
      bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
   }

   if (dst_bits == 64) {
      if (is_signed && dst.regClass() == s2) {
         Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else if (is_signed && dst.regClass() == v2) {
         Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else {
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
      }
   }

   return dst;
}

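/* Translates a single NIR ALU instruction into ACO IR, dispatching on the
 * NIR opcode and on whether the destination lives in SGPRs or VGPRs. */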
1065 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
1066 {
1067 if (!instr->dest.dest.is_ssa) {
1068 fprintf(stderr, "nir alu dst not in ssa: ");
1069 nir_print_instr(&instr->instr, stderr);
1070 fprintf(stderr, "\n");
1071 abort();
1072 }
1073 Builder bld(ctx->program, ctx->block);
1074 bld.is_precise = instr->exact;
1075 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
1076 switch(instr->op) {
1077 case nir_op_vec2:
1078 case nir_op_vec3:
1079 case nir_op_vec4: {
1080 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
1081 unsigned num = instr->dest.dest.ssa.num_components;
1082 for (unsigned i = 0; i < num; ++i)
1083 elems[i] = get_alu_src(ctx, instr->src[i]);
1084
1085 if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
1086 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
1087 RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
1088 for (unsigned i = 0; i < num; ++i) {
1089 if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1090 vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
1091 else
1092 vec->operands[i] = Operand{elems[i]};
1093 }
1094 vec->definitions[0] = Definition(dst);
1095 ctx->block->instructions.emplace_back(std::move(vec));
1096 ctx->allocated_vec.emplace(dst.id(), elems);
1097 } else {
1098 // TODO: that is a bit suboptimal..
1099 Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1));
1100 for (unsigned i = 0; i < num - 1; ++i)
1101 if (((i+1) * instr->dest.dest.ssa.bit_size) % 32)
1102 elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1103 for (unsigned i = 0; i < num; ++i) {
1104 unsigned bit = i * instr->dest.dest.ssa.bit_size;
1105 if (bit % 32 == 0) {
1106 elems[bit / 32] = elems[i];
1107 } else {
1108 elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
1109 elems[i], Operand((i * instr->dest.dest.ssa.bit_size) % 32));
1110 elems[bit / 32] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[bit / 32], elems[i]);
1111 }
1112 }
1113 if (dst.size() == 1)
1114 bld.copy(Definition(dst), elems[0]);
1115 else
1116 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), elems[0], elems[1]);
1117 }
1118 break;
1119 }
1120 case nir_op_mov: {
1121 Temp src = get_alu_src(ctx, instr->src[0]);
1122 aco_ptr<Instruction> mov;
1123 if (dst.type() == RegType::sgpr) {
1124 if (src.type() == RegType::vgpr)
1125 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1126 else if (src.regClass() == s1)
1127 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
1128 else if (src.regClass() == s2)
1129 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
1130 else
1131 unreachable("wrong src register class for nir_op_imov");
1132 } else {
1133 if (dst.regClass() == v1)
1134 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
1135 else if (dst.regClass() == v1b ||
1136 dst.regClass() == v2b ||
1137 dst.regClass() == v2)
1138 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
1139 else
1140 unreachable("wrong src register class for nir_op_imov");
1141 }
1142 break;
1143 }
1144 case nir_op_inot: {
1145 Temp src = get_alu_src(ctx, instr->src[0]);
1146 if (instr->dest.dest.ssa.bit_size == 1) {
1147 assert(src.regClass() == bld.lm);
1148 assert(dst.regClass() == bld.lm);
1149 /* Don't use s_andn2 here, this allows the optimizer to make a better decision */
1150 Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
1151 bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
1152 } else if (dst.regClass() == v1) {
1153 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1154 } else if (dst.regClass() == v2) {
1155 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1156 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1157 lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1158 hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1159 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1160 } else if (dst.type() == RegType::sgpr) {
1161 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1162 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1163 } else {
1164 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1165 nir_print_instr(&instr->instr, stderr);
1166 fprintf(stderr, "\n");
1167 }
1168 break;
1169 }
1170 case nir_op_ineg: {
1171 Temp src = get_alu_src(ctx, instr->src[0]);
1172 if (dst.regClass() == v1) {
1173 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
1174 } else if (dst.regClass() == s1) {
1175 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
1176 } else if (dst.size() == 2) {
1177 Temp src0 = bld.tmp(dst.type(), 1);
1178 Temp src1 = bld.tmp(dst.type(), 1);
1179 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
1180
1181 if (dst.regClass() == s2) {
1182 Temp carry = bld.tmp(s1);
1183 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
1184 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
1185 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1186 } else {
1187 Temp lower = bld.tmp(v1);
1188 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
1189 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
1190 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1191 }
1192 } else {
1193 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1194 nir_print_instr(&instr->instr, stderr);
1195 fprintf(stderr, "\n");
1196 }
1197 break;
1198 }
1199 case nir_op_iabs: {
1200 if (dst.regClass() == s1) {
1201 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
1202 } else if (dst.regClass() == v1) {
1203 Temp src = get_alu_src(ctx, instr->src[0]);
1204 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
1205 } else {
1206 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1207 nir_print_instr(&instr->instr, stderr);
1208 fprintf(stderr, "\n");
1209 }
1210 break;
1211 }
1212 case nir_op_isign: {
1213 Temp src = get_alu_src(ctx, instr->src[0]);
1214 if (dst.regClass() == s1) {
1215 Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1));
1216 bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u));
1217 } else if (dst.regClass() == s2) {
1218 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
1219 Temp neqz;
1220 if (ctx->program->chip_class >= GFX8)
1221 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
1222 else
1223 neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
1224 /* SCC gets zero-extended to 64 bit */
1225 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1226 } else if (dst.regClass() == v1) {
1227 bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand((uint32_t)-1), src, Operand(1u));
1228 } else if (dst.regClass() == v2) {
1229 Temp upper = emit_extract_vector(ctx, src, 1, v1);
1230 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
1231 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1232 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
1233 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
1234 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1235 } else {
1236 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1237 nir_print_instr(&instr->instr, stderr);
1238 fprintf(stderr, "\n");
1239 }
1240 break;
1241 }
1242 case nir_op_imax: {
1243 if (dst.regClass() == v1) {
1244 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1245 } else if (dst.regClass() == s1) {
1246 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1247 } else {
1248 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1249 nir_print_instr(&instr->instr, stderr);
1250 fprintf(stderr, "\n");
1251 }
1252 break;
1253 }
1254 case nir_op_umax: {
1255 if (dst.regClass() == v1) {
1256 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1257 } else if (dst.regClass() == s1) {
1258 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1259 } else {
1260 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1261 nir_print_instr(&instr->instr, stderr);
1262 fprintf(stderr, "\n");
1263 }
1264 break;
1265 }
1266 case nir_op_imin: {
1267 if (dst.regClass() == v1) {
1268 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1269 } else if (dst.regClass() == s1) {
1270 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1271 } else {
1272 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1273 nir_print_instr(&instr->instr, stderr);
1274 fprintf(stderr, "\n");
1275 }
1276 break;
1277 }
1278 case nir_op_umin: {
1279 if (dst.regClass() == v1) {
1280 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1281 } else if (dst.regClass() == s1) {
1282 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1283 } else {
1284 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1285 nir_print_instr(&instr->instr, stderr);
1286 fprintf(stderr, "\n");
1287 }
1288 break;
1289 }
1290 case nir_op_ior: {
1291 if (instr->dest.dest.ssa.bit_size == 1) {
1292 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1293 } else if (dst.regClass() == v1) {
1294 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1295 } else if (dst.regClass() == v2) {
1296 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1297 } else if (dst.regClass() == s1) {
1298 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1299 } else if (dst.regClass() == s2) {
1300 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1301 } else {
1302 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1303 nir_print_instr(&instr->instr, stderr);
1304 fprintf(stderr, "\n");
1305 }
1306 break;
1307 }
1308 case nir_op_iand: {
1309 if (instr->dest.dest.ssa.bit_size == 1) {
1310 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1311 } else if (dst.regClass() == v1) {
1312 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1313 } else if (dst.regClass() == v2) {
1314 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1315 } else if (dst.regClass() == s1) {
1316 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1317 } else if (dst.regClass() == s2) {
1318 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1319 } else {
1320 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1321 nir_print_instr(&instr->instr, stderr);
1322 fprintf(stderr, "\n");
1323 }
1324 break;
1325 }
1326 case nir_op_ixor: {
1327 if (instr->dest.dest.ssa.bit_size == 1) {
1328 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1329 } else if (dst.regClass() == v1) {
1330 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1331 } else if (dst.regClass() == v2) {
1332 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1333 } else if (dst.regClass() == s1) {
1334 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1335 } else if (dst.regClass() == s2) {
1336 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1337 } else {
1338 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1339 nir_print_instr(&instr->instr, stderr);
1340 fprintf(stderr, "\n");
1341 }
1342 break;
1343 }
1344 case nir_op_ushr: {
1345 if (dst.regClass() == v1) {
1346 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1347 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
         bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
                  get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
                  get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ishl: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
      } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
         bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
                  get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
                  get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ishr: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
      } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
         bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
                  get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
                  get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_find_lsb: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1) {
         bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
      } else if (src.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
      } else if (src.regClass() == s2) {
         bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ufind_msb:
   case nir_op_ifind_msb: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1 || src.regClass() == s2) {
         aco_opcode op = src.regClass() == s2 ?
                         (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
                         (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
         Temp msb_rev = bld.sop1(op, bld.def(s1), src);

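         /* s_flbit counts from the MSB side, so convert the result to a bit
          * index with (bits - 1) - msb_rev. When no bit is set, s_flbit returns
          * -1, the subtraction underflows, and the borrow selects -1 below. */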
         Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
                                        Operand(src.size() * 32u - 1u), msb_rev);
         Temp msb = sub.def(0).getTemp();
         Temp carry = sub.def(1).getTemp();

         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
      } else if (src.regClass() == v1) {
         aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
         Temp msb_rev = bld.tmp(v1);
         emit_vop1_instruction(ctx, instr, op, msb_rev);
         Temp msb = bld.tmp(v1);
         Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_bitfield_reverse: {
      if (dst.regClass() == s1) {
         bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v1) {
         bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_iadd: {
      if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
         break;
      }

      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == v1) {
         bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
         break;
      }

      assert(src0.size() == 2 && src1.size() == 2);
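      /* 64-bit addition: split both operands into 32-bit halves, add the low
       * halves, then propagate the carry into the addition of the high halves. */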
      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);

      if (dst.regClass() == s2) {
         Temp carry = bld.tmp(s1);
         Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
         Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else if (dst.regClass() == v2) {
         Temp dst0 = bld.tmp(v1);
         Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
         Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_uadd_sat: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
         bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
                  src0, src1);
         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
      } else if (dst.regClass() == v1) {
         if (ctx->options->chip_class >= GFX9) {
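            /* GFX9+ can saturate the addition directly via the VOP3 clamp bit;
             * older chips take the add-with-carry + select path below. */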
            aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
            add->operands[0] = Operand(src0);
            add->operands[1] = Operand(src1);
            add->definitions[0] = Definition(dst);
            add->clamp = 1;
            ctx->block->instructions.emplace_back(std::move(add));
         } else {
            if (src1.regClass() != v1)
               std::swap(src0, src1);
            assert(src1.regClass() == v1);
            Temp tmp = bld.tmp(v1);
            Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
            bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
         }
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_uadd_carry: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
         break;
      }
      if (dst.regClass() == v1) {
         Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
         break;
      }

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
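      /* Only the final carry-out matters: chain the two adds and zero-extend
       * the 1-bit result into the 64-bit destination. */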
      if (dst.regClass() == s2) {
         Temp carry = bld.tmp(s1);
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
         carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
      } else if (dst.regClass() == v2) {
         Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
         carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
         carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_isub: {
      if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
         break;
      }

      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == v1) {
         bld.vsub32(Definition(dst), src0, src1);
         break;
      }

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp carry = bld.tmp(s1);
         Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
         Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else if (dst.regClass() == v2) {
         Temp lower = bld.tmp(v1);
         Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
         Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_usub_borrow: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
         break;
      } else if (dst.regClass() == v1) {
         Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
         break;
      }

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp borrow = bld.tmp(s1);
         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
         borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
      } else if (dst.regClass() == v2) {
         Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
         borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
         borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_imul: {
      if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
                  get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_umul_high: {
      if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
         bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1) {
         Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
                             as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_imul_high: {
      if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
         bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1) {
         Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
                             as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fmul: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
      if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fadd: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
      if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fsub: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == v2b) {
         if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
            emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
         else
            emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
      } else if (dst.regClass() == v1) {
         if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
            emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
         else
            emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
      } else if (dst.regClass() == v2) {
         Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
                                     as_vgpr(ctx, src0), as_vgpr(ctx, src1));
         VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
         sub->neg[1] = true;
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fmax: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
      if (dst.regClass() == v2b) {
         // TODO: check fp_mode.must_flush_denorms16_64
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
      } else if (dst.regClass() == v2) {
         if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
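            /* Presumably v_max_f64/v_min_f64 do not flush denormals before
             * GFX9, so multiply the result by 1.0 to flush them explicitly. */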
            Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1);
            bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
         } else {
            bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
         }
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fmin: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
      if (dst.regClass() == v2b) {
         // TODO: check fp_mode.must_flush_denorms16_64
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
      } else if (dst.regClass() == v2) {
         if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
            Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1);
            bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
         } else {
            bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
         }
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fmax3: {
      if (dst.regClass() == v2b) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false);
      } else if (dst.regClass() == v1) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fmin3: {
      if (dst.regClass() == v2b) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false);
      } else if (dst.regClass() == v1) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fmed3: {
      if (dst.regClass() == v2b) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false);
      } else if (dst.regClass() == v1) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_umax3: {
      if (dst.size() == 1) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_umin3: {
      if (dst.size() == 1) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_umed3: {
      if (dst.size() == 1) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_imax3: {
      if (dst.size() == 1) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_imin3: {
      if (dst.size() == 1) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_imed3: {
      if (dst.size() == 1) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_cube_face_coord: {
      Temp in = get_alu_src(ctx, instr->src[0], 3);
      Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
                      emit_extract_vector(ctx, in, 1, v1),
                      emit_extract_vector(ctx, in, 2, v1) };
      Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
      ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
      Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
      Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
      sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
      tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
      break;
   }
   case nir_op_cube_face_index: {
      Temp in = get_alu_src(ctx, instr->src[0], 3);
      Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
                      emit_extract_vector(ctx, in, 1, v1),
                      emit_extract_vector(ctx, in, 2, v1) };
      bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
      break;
   }
   case nir_op_bcsel: {
      emit_bcsel(ctx, instr, dst);
      break;
   }
   case nir_op_frsq: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_rsq(ctx, bld, Definition(dst), src);
      } else if (dst.regClass() == v2) {
         /* Lowered at NIR level for precision reasons. */
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fneg: {
      Temp src = get_alu_src(ctx, instr->src[0]);
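      /* Negation is just a sign-bit XOR; when the mode requires it, multiply
       * by 1.0 first to flush denormals, since the bit operation alone would
       * not do so. */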
      if (dst.regClass() == v2b) {
         bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
      } else if (dst.regClass() == v1) {
         if (ctx->block->fp_mode.must_flush_denorms32)
            src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
         bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
      } else if (dst.regClass() == v2) {
         if (ctx->block->fp_mode.must_flush_denorms16_64)
            src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
         Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
         upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fabs: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
      } else if (dst.regClass() == v1) {
         if (ctx->block->fp_mode.must_flush_denorms32)
            src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
         bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
      } else if (dst.regClass() == v2) {
         if (ctx->block->fp_mode.must_flush_denorms16_64)
            src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
         Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
         upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fsat: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
      } else if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
         /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
         // TODO: confirm that this holds under any circumstances
      } else if (dst.regClass() == v2) {
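         /* There is no v_med3 for f64; instead, set the VOP3 clamp bit on an
          * add with +0.0, which saturates the result to [0.0, 1.0]. */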
         Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
         VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
         vop3->clamp = true;
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_flog2: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_log2(ctx, bld, Definition(dst), src);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_frcp: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_rcp(ctx, bld, Definition(dst), src);
      } else if (dst.regClass() == v2) {
         /* Lowered at NIR level for precision reasons. */
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fexp2: {
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fsqrt: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_sqrt(ctx, bld, Definition(dst), src);
      } else if (dst.regClass() == v2) {
         /* Lowered at NIR level for precision reasons. */
         emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ffract: {
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
      } else if (dst.regClass() == v2) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ffloor: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
      } else if (dst.regClass() == v2) {
         emit_floor_f64(ctx, bld, Definition(dst), src);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fceil: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
      } else if (dst.regClass() == v2) {
         if (ctx->options->chip_class >= GFX7) {
            emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
         } else {
            /* GFX6 doesn't support V_CEIL_F64, lower it. */
            /* trunc = trunc(src0)
             * if (src0 > 0.0 && src0 != trunc)
             *    trunc += 1.0
             */
            Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
            Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u));
            Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
            Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1);
            Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond);
            add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add);
            bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
         }
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ftrunc: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
      } else if (dst.regClass() == v2) {
         emit_trunc_f64(ctx, bld, Definition(dst), src);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fround_even: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
      } else if (dst.regClass() == v2) {
         if (ctx->options->chip_class >= GFX7) {
            emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
         } else {
            /* GFX6 doesn't support V_RNDNE_F64, lower it. */
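            /* Classic round-to-nearest-even trick: adding and then subtracting
             * 2^52 (high dword 0x43300000) forces the FPU to round away the
             * fractional bits. v_bfi_b32 copies the sign of the source into the
             * magic constant so this also works for negative inputs, and values
             * with a magnitude of 2^52 or more are already integral, so the
             * final compare selects them through unchanged. */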
            Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);

            Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u)));
            Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi));
            Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
            Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
            static_cast<VOP3A_instruction*>(sub)->neg[1] = true;
            tmp = sub->definitions[0].getTemp();

            Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu));
            Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
            static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
            Temp cond = vop3->definitions[0].getTemp();

            Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
            Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond);
            Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond);

            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
         }
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fsin:
   case nir_op_fcos: {
      Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
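      /* Despite the variable name below, the scale constants (0x3118 for fp16,
       * 0x3e22f983 for fp32) are 1/(2*pi): the hardware sin/cos opcodes expect
       * their input measured in full turns rather than radians. */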
      if (dst.regClass() == v2b) {
         Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
         Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
         aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
         bld.vop1(opcode, Definition(dst), tmp);
      } else if (dst.regClass() == v1) {
         Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
         Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);

         /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
         if (ctx->options->chip_class < GFX9)
            tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);

         aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
         bld.vop1(opcode, Definition(dst), tmp);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ldexp: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
      } else if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_frexp_sig: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
      } else if (dst.regClass() == v1) {
         bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
      } else if (dst.regClass() == v2) {
         bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_frexp_exp: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 16) {
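         /* The exponent of an fp16 value fits in the low byte of the 16-bit
          * result, so extract that byte and sign-extend it to 32 bits. */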
         Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
         tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u));
         convert_int(ctx, bld, tmp, 8, 32, true, dst);
      } else if (instr->src[0].src.ssa->bit_size == 32) {
         bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fsign: {
      Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
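      /* sign(x) with two selects: the first replaces strictly positive inputs
       * with +1.0, the second replaces strictly negative inputs with -1.0;
       * +/-0.0 passes through unchanged. */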
      if (dst.regClass() == v2b) {
         Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
         Temp minus_one = bld.copy(bld.def(v1), Operand(0xbc00u));
         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
         cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
      } else if (dst.regClass() == v1) {
         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
         cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
      } else if (dst.regClass() == v2) {
         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
         Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);

         cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_f2f16:
   case nir_op_f2f16_rtne: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 64)
         src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
      bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
      break;
   }
   case nir_op_f2f16_rtz: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 64)
         src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
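      /* v_cvt_pkrtz_f16_f32 rounds toward zero regardless of the current
       * rounding mode; the second pack source is a don't-care zero. */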
      bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
      break;
   }
   case nir_op_f2f32: {
      if (instr->src[0].src.ssa->bit_size == 16) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_f2f64: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 16)
         src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
      bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
      break;
   }
   case nir_op_i2f16: {
      assert(dst.regClass() == v2b);
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 8)
         src = convert_int(ctx, bld, src, 8, 16, true);
      else if (instr->src[0].src.ssa->bit_size == 64)
         src = convert_int(ctx, bld, src, 64, 32, false);
      bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
      break;
   }
   case nir_op_i2f32: {
      assert(dst.size() == 1);
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size <= 16)
         src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
      bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
      break;
   }
   case nir_op_i2f64: {
      if (instr->src[0].src.ssa->bit_size <= 32) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         if (instr->src[0].src.ssa->bit_size <= 16)
            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
         bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
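         /* There is no 64-bit integer-to-double conversion: convert each 32-bit
          * half separately, scale the high half by 2^32 with v_ldexp_f64, then
          * add. Only the high half carries the sign; the low half is unsigned. */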
         Temp src = get_alu_src(ctx, instr->src[0]);
         RegClass rc = RegClass(src.type(), 1);
         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
         upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
         bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper