1d7aae1be2ec72b1acc7ca160eb724ad3d25a0b6
[mesa.git] / src / amd / compiler / aco_instruction_selection.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <stack>
29 #include <map>
30
31 #include "ac_shader_util.h"
32 #include "aco_ir.h"
33 #include "aco_builder.h"
34 #include "aco_interface.h"
35 #include "aco_instruction_selection_setup.cpp"
36 #include "util/fast_idiv_by_const.h"
37
38 namespace aco {
39 namespace {
40
41 class loop_info_RAII {
42 isel_context* ctx;
43 unsigned header_idx_old;
44 Block* exit_old;
45 bool divergent_cont_old;
46 bool divergent_branch_old;
47 bool divergent_if_old;
48
49 public:
50 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
51 : ctx(ctx),
52 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
53 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
54 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
55 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
56 {
57 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
58 ctx->cf_info.parent_loop.exit = loop_exit;
59 ctx->cf_info.parent_loop.has_divergent_continue = false;
60 ctx->cf_info.parent_loop.has_divergent_branch = false;
61 ctx->cf_info.parent_if.is_divergent = false;
62 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
63 }
64
65 ~loop_info_RAII()
66 {
67 ctx->cf_info.parent_loop.header_idx = header_idx_old;
68 ctx->cf_info.parent_loop.exit = exit_old;
69 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
70 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
71 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
72 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
73 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
74 ctx->cf_info.exec_potentially_empty_discard = false;
75 }
76 };
77
78 struct if_context {
79 Temp cond;
80
81 bool divergent_old;
82 bool exec_potentially_empty_discard_old;
83 bool exec_potentially_empty_break_old;
84 uint16_t exec_potentially_empty_break_depth_old;
85
86 unsigned BB_if_idx;
87 unsigned invert_idx;
88 bool uniform_has_then_branch;
89 bool then_branch_divergent;
90 Block BB_invert;
91 Block BB_endif;
92 };
93
/* Forward declaration; the definition is not part of this view. */
static bool visit_cf_list(struct isel_context *ctx, struct exec_list *list);
96
97 static void add_logical_edge(unsigned pred_idx, Block *succ)
98 {
99 succ->logical_preds.emplace_back(pred_idx);
100 }
101
102
103 static void add_linear_edge(unsigned pred_idx, Block *succ)
104 {
105 succ->linear_preds.emplace_back(pred_idx);
106 }
107
108 static void add_edge(unsigned pred_idx, Block *succ)
109 {
110 add_logical_edge(pred_idx, succ);
111 add_linear_edge(pred_idx, succ);
112 }
113
114 static void append_logical_start(Block *b)
115 {
116 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
117 }
118
119 static void append_logical_end(Block *b)
120 {
121 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
122 }
123
124 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
125 {
126 assert(ctx->allocated[def->index].id());
127 return ctx->allocated[def->index];
128 }
129
130 Temp emit_mbcnt(isel_context *ctx, Definition dst,
131 Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
132 {
133 Builder bld(ctx->program, ctx->block);
134 Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
135 Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));
136
137 if (ctx->program->wave_size == 32) {
138 return thread_id_lo;
139 } else if (ctx->program->chip_class <= GFX7) {
140 Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
141 return thread_id_hi;
142 } else {
143 Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo);
144 return thread_id_hi;
145 }
146 }
147
148 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
149 {
150 Builder bld(ctx->program, ctx->block);
151
152 if (!dst.id())
153 dst = bld.tmp(src.regClass());
154
155 assert(src.size() == dst.size());
156
157 if (ctx->stage != fragment_fs) {
158 if (!dst.id())
159 return src;
160
161 bld.copy(Definition(dst), src);
162 return dst;
163 }
164
165 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
166 ctx->program->needs_wqm |= program_needs_wqm;
167 return dst;
168 }
169
170 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
171 {
172 if (index.regClass() == s1)
173 return bld.readlane(bld.def(s1), data, index);
174
175 if (ctx->options->chip_class <= GFX7) {
176 /* GFX6-7: there is no bpermute instruction */
177 Operand index_op(index);
178 Operand input_data(data);
179 index_op.setLateKill(true);
180 input_data.setLateKill(true);
181
182 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
183 } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
184 /* GFX10 wave64 mode: emulate full-wave bpermute */
185 if (!ctx->has_gfx10_wave64_bpermute) {
186 ctx->has_gfx10_wave64_bpermute = true;
187 ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
188 ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
189 }
190
191 Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
192 Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
193 Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
194 Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
195 Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
196 Operand input_data(data);
197
198 index_x4.setLateKill(true);
199 input_data.setLateKill(true);
200 same_half.setLateKill(true);
201
202 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
203 } else {
204 /* GFX8-9 or GFX10 wave32: bpermute works normally */
205 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
206 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
207 }
208 }
209
210 static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask)
211 {
212 if (ctx->options->chip_class >= GFX8) {
213 unsigned and_mask = mask & 0x1f;
214 unsigned or_mask = (mask >> 5) & 0x1f;
215 unsigned xor_mask = (mask >> 10) & 0x1f;
216
217 uint16_t dpp_ctrl = 0xffff;
218
219 // TODO: we could use DPP8 for some swizzles
220 if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
221 unsigned res[4] = {0, 1, 2, 3};
222 for (unsigned i = 0; i < 4; i++)
223 res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
224 dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
225 } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
226 dpp_ctrl = dpp_row_rr(8);
227 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
228 dpp_ctrl = dpp_row_mirror;
229 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
230 dpp_ctrl = dpp_row_half_mirror;
231 }
232
233 if (dpp_ctrl != 0xffff)
234 return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
235 }
236
237 return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
238 }
239
240 Temp as_vgpr(isel_context *ctx, Temp val)
241 {
242 if (val.type() == RegType::sgpr) {
243 Builder bld(ctx->program, ctx->block);
244 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
245 }
246 assert(val.type() == RegType::vgpr);
247 return val;
248 }
249
250 //assumes a != 0xffffffff
251 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
252 {
253 assert(b != 0);
254 Builder bld(ctx->program, ctx->block);
255
256 if (util_is_power_of_two_or_zero(b)) {
257 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
258 return;
259 }
260
261 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
262
263 assert(info.multiplier <= 0xffffffff);
264
265 bool pre_shift = info.pre_shift != 0;
266 bool increment = info.increment != 0;
267 bool multiply = true;
268 bool post_shift = info.post_shift != 0;
269
270 if (!pre_shift && !increment && !multiply && !post_shift) {
271 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
272 return;
273 }
274
275 Temp pre_shift_dst = a;
276 if (pre_shift) {
277 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
278 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
279 }
280
281 Temp increment_dst = pre_shift_dst;
282 if (increment) {
283 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
284 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
285 }
286
287 Temp multiply_dst = increment_dst;
288 if (multiply) {
289 multiply_dst = post_shift ? bld.tmp(v1) : dst;
290 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
291 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
292 }
293
294 if (post_shift) {
295 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
296 }
297 }
298
299 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
300 {
301 Builder bld(ctx->program, ctx->block);
302 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
303 }
304
305
306 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
307 {
308 /* no need to extract the whole vector */
309 if (src.regClass() == dst_rc) {
310 assert(idx == 0);
311 return src;
312 }
313
314 assert(src.bytes() > (idx * dst_rc.bytes()));
315 Builder bld(ctx->program, ctx->block);
316 auto it = ctx->allocated_vec.find(src.id());
317 if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
318 if (it->second[idx].regClass() == dst_rc) {
319 return it->second[idx];
320 } else {
321 assert(!dst_rc.is_subdword());
322 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
323 return bld.copy(bld.def(dst_rc), it->second[idx]);
324 }
325 }
326
327 if (dst_rc.is_subdword())
328 src = as_vgpr(ctx, src);
329
330 if (src.bytes() == dst_rc.bytes()) {
331 assert(idx == 0);
332 return bld.copy(bld.def(dst_rc), src);
333 } else {
334 Temp dst = bld.tmp(dst_rc);
335 emit_extract_vector(ctx, src, idx, dst);
336 return dst;
337 }
338 }
339
340 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
341 {
342 if (num_components == 1)
343 return;
344 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
345 return;
346 RegClass rc;
347 if (num_components > vec_src.size()) {
348 if (vec_src.type() == RegType::sgpr) {
349 /* should still help get_alu_src() */
350 emit_split_vector(ctx, vec_src, vec_src.size());
351 return;
352 }
353 /* sub-dword split */
354 rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
355 } else {
356 rc = RegClass(vec_src.type(), vec_src.size() / num_components);
357 }
358 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
359 split->operands[0] = Operand(vec_src);
360 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
361 for (unsigned i = 0; i < num_components; i++) {
362 elems[i] = {ctx->program->allocateId(), rc};
363 split->definitions[i] = Definition(elems[i]);
364 }
365 ctx->block->instructions.emplace_back(std::move(split));
366 ctx->allocated_vec.emplace(vec_src.id(), elems);
367 }
368
369 /* This vector expansion uses a mask to determine which elements in the new vector
370 * come from the original vector. The other elements are undefined. */
371 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
372 {
373 emit_split_vector(ctx, vec_src, util_bitcount(mask));
374
375 if (vec_src == dst)
376 return;
377
378 Builder bld(ctx->program, ctx->block);
379 if (num_components == 1) {
380 if (dst.type() == RegType::sgpr)
381 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
382 else
383 bld.copy(Definition(dst), vec_src);
384 return;
385 }
386
387 unsigned component_size = dst.size() / num_components;
388 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
389
390 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
391 vec->definitions[0] = Definition(dst);
392 unsigned k = 0;
393 for (unsigned i = 0; i < num_components; i++) {
394 if (mask & (1 << i)) {
395 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
396 if (dst.type() == RegType::sgpr)
397 src = bld.as_uniform(src);
398 vec->operands[i] = Operand(src);
399 } else {
400 vec->operands[i] = Operand(0u);
401 }
402 elems[i] = vec->operands[i].getTemp();
403 }
404 ctx->block->instructions.emplace_back(std::move(vec));
405 ctx->allocated_vec.emplace(dst.id(), elems);
406 }
407
408 /* adjust misaligned small bit size loads */
409 void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
410 {
411 Builder bld(ctx->program, ctx->block);
412 Operand shift;
413 Temp select = Temp();
414 if (offset.isConstant()) {
415 assert(offset.constantValue() && offset.constantValue() < 4);
416 shift = Operand(offset.constantValue() * 8);
417 } else {
418 /* bit_offset = 8 * (offset & 0x3) */
419 Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
420 select = bld.tmp(s1);
421 shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
422 }
423
424 if (vec.size() == 1) {
425 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
426 } else if (vec.size() == 2) {
427 Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
428 bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
429 if (tmp == dst)
430 emit_split_vector(ctx, dst, 2);
431 else
432 emit_extract_vector(ctx, tmp, 0, dst);
433 } else if (vec.size() == 4) {
434 Temp lo = bld.tmp(s2), hi = bld.tmp(s2);
435 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
436 hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
437 if (select != Temp())
438 hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
439 lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
440 Temp mid = bld.tmp(s1);
441 lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
442 hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
443 mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
444 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
445 emit_split_vector(ctx, dst, 2);
446 }
447 }
448
449 void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
450 {
451 Builder bld(ctx->program, ctx->block);
452 if (offset.isTemp()) {
453 Temp tmp[4] = {vec, vec, vec, vec};
454
455 if (vec.size() == 4) {
456 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
457 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec);
458 } else if (vec.size() == 3) {
459 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
460 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
461 } else if (vec.size() == 2) {
462 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
463 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
464 }
465 for (unsigned i = 0; i < dst.size(); i++)
466 tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
467
468 vec = tmp[0];
469 if (dst.size() == 2)
470 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
471
472 offset = Operand(0u);
473 }
474
475 unsigned num_components = vec.bytes() / component_size;
476 if (vec.regClass() == dst.regClass()) {
477 assert(offset.constantValue() == 0);
478 bld.copy(Definition(dst), vec);
479 emit_split_vector(ctx, dst, num_components);
480 return;
481 }
482
483 emit_split_vector(ctx, vec, num_components);
484 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
485 RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
486
487 assert(offset.constantValue() % component_size == 0);
488 unsigned skip = offset.constantValue() / component_size;
489 for (unsigned i = skip; i < num_components; i++)
490 elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
491
492 /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
493 if (dst.type() == RegType::vgpr) {
494 num_components = dst.bytes() / component_size;
495 aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
496 for (unsigned i = 0; i < num_components; i++)
497 create_vec->operands[i] = Operand(elems[i]);
498 create_vec->definitions[0] = Definition(dst);
499 bld.insert(std::move(create_vec));
500
501 /* if dst is sgpr - split the src, but move the original to sgpr. */
502 } else if (skip) {
503 vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
504 byte_align_scalar(ctx, vec, offset, dst);
505 } else {
506 assert(dst.size() == vec.size());
507 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
508 }
509
510 ctx->allocated_vec.emplace(dst.id(), elems);
511 }
512
513 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
514 {
515 Builder bld(ctx->program, ctx->block);
516 if (!dst.id())
517 dst = bld.tmp(bld.lm);
518
519 assert(val.regClass() == s1);
520 assert(dst.regClass() == bld.lm);
521
522 return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
523 }
524
525 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
526 {
527 Builder bld(ctx->program, ctx->block);
528 if (!dst.id())
529 dst = bld.tmp(s1);
530
531 assert(val.regClass() == bld.lm);
532 assert(dst.regClass() == s1);
533
534 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
535 Temp tmp = bld.tmp(s1);
536 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
537 return emit_wqm(ctx, tmp, dst);
538 }
539
540 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
541 {
542 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
543 return get_ssa_temp(ctx, src.src.ssa);
544
545 if (src.src.ssa->num_components == size) {
546 bool identity_swizzle = true;
547 for (unsigned i = 0; identity_swizzle && i < size; i++) {
548 if (src.swizzle[i] != i)
549 identity_swizzle = false;
550 }
551 if (identity_swizzle)
552 return get_ssa_temp(ctx, src.src.ssa);
553 }
554
555 Temp vec = get_ssa_temp(ctx, src.src.ssa);
556 unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
557 assert(elem_size > 0);
558 assert(vec.bytes() % elem_size == 0);
559
560 if (elem_size < 4 && vec.type() == RegType::sgpr) {
561 assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
562 assert(size == 1);
563 unsigned swizzle = src.swizzle[0];
564 if (vec.size() > 1) {
565 assert(src.src.ssa->bit_size == 16);
566 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
567 swizzle = swizzle & 1;
568 }
569 if (swizzle == 0)
570 return vec;
571
572 Temp dst{ctx->program->allocateId(), s1};
573 aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)};
574 bfe->operands[0] = Operand(vec);
575 bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
576 bfe->definitions[0] = Definition(dst);
577 bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
578 ctx->block->instructions.emplace_back(std::move(bfe));
579 return dst;
580 }
581
582 RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
583 if (size == 1) {
584 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
585 } else {
586 assert(size <= 4);
587 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
588 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
589 for (unsigned i = 0; i < size; ++i) {
590 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
591 vec_instr->operands[i] = Operand{elems[i]};
592 }
593 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)};
594 vec_instr->definitions[0] = Definition(dst);
595 ctx->block->instructions.emplace_back(std::move(vec_instr));
596 ctx->allocated_vec.emplace(dst.id(), elems);
597 return dst;
598 }
599 }
600
601 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
602 {
603 if (ptr.size() == 2)
604 return ptr;
605 Builder bld(ctx->program, ctx->block);
606 if (ptr.type() == RegType::vgpr)
607 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
608 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
609 ptr, Operand((unsigned)ctx->options->address32_hi));
610 }
611
612 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
613 {
614 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
615 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
616 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
617 sop2->definitions[0] = Definition(dst);
618 if (instr->no_unsigned_wrap)
619 sop2->definitions[0].setNUW(true);
620 if (writes_scc)
621 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
622 ctx->block->instructions.emplace_back(std::move(sop2));
623 }
624
625 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
626 bool commutative, bool swap_srcs=false, bool flush_denorms = false)
627 {
628 Builder bld(ctx->program, ctx->block);
629 bld.is_precise = instr->exact;
630
631 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
632 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
633 if (src1.type() == RegType::sgpr) {
634 if (commutative && src0.type() == RegType::vgpr) {
635 Temp t = src0;
636 src0 = src1;
637 src1 = t;
638 } else {
639 src1 = as_vgpr(ctx, src1);
640 }
641 }
642
643 if (flush_denorms && ctx->program->chip_class < GFX9) {
644 assert(dst.size() == 1);
645 Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
646 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
647 } else {
648 bld.vop2(op, Definition(dst), src0, src1);
649 }
650 }
651
652 void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
653 aco_opcode op, Temp dst)
654 {
655 Builder bld(ctx->program, ctx->block);
656 bld.is_precise = instr->exact;
657
658 Temp src0 = get_alu_src(ctx, instr->src[0]);
659 Temp src1 = get_alu_src(ctx, instr->src[1]);
660
661 if (src1.type() == RegType::sgpr) {
662 assert(src0.type() == RegType::vgpr);
663 std::swap(src0, src1);
664 }
665
666 Temp src00 = bld.tmp(src0.type(), 1);
667 Temp src01 = bld.tmp(src0.type(), 1);
668 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
669 Temp src10 = bld.tmp(v1);
670 Temp src11 = bld.tmp(v1);
671 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
672 Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
673 Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
674 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
675 }
676
677 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
678 bool flush_denorms = false)
679 {
680 Temp src0 = get_alu_src(ctx, instr->src[0]);
681 Temp src1 = get_alu_src(ctx, instr->src[1]);
682 Temp src2 = get_alu_src(ctx, instr->src[2]);
683
684 /* ensure that the instruction has at most 1 sgpr operand
685 * The optimizer will inline constants for us */
686 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
687 src0 = as_vgpr(ctx, src0);
688 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
689 src1 = as_vgpr(ctx, src1);
690 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
691 src2 = as_vgpr(ctx, src2);
692
693 Builder bld(ctx->program, ctx->block);
694 bld.is_precise = instr->exact;
695 if (flush_denorms && ctx->program->chip_class < GFX9) {
696 assert(dst.size() == 1);
697 Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
698 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
699 } else {
700 bld.vop3(op, Definition(dst), src0, src1, src2);
701 }
702 }
703
704 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
705 {
706 Builder bld(ctx->program, ctx->block);
707 bld.is_precise = instr->exact;
708 if (dst.type() == RegType::sgpr)
709 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
710 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
711 else
712 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
713 }
714
715 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
716 {
717 Temp src0 = get_alu_src(ctx, instr->src[0]);
718 Temp src1 = get_alu_src(ctx, instr->src[1]);
719 assert(src0.size() == src1.size());
720
721 aco_ptr<Instruction> vopc;
722 if (src1.type() == RegType::sgpr) {
723 if (src0.type() == RegType::vgpr) {
724 /* to swap the operands, we might also have to change the opcode */
725 switch (op) {
726 case aco_opcode::v_cmp_lt_f16:
727 op = aco_opcode::v_cmp_gt_f16;
728 break;
729 case aco_opcode::v_cmp_ge_f16:
730 op = aco_opcode::v_cmp_le_f16;
731 break;
732 case aco_opcode::v_cmp_lt_i16:
733 op = aco_opcode::v_cmp_gt_i16;
734 break;
735 case aco_opcode::v_cmp_ge_i16:
736 op = aco_opcode::v_cmp_le_i16;
737 break;
738 case aco_opcode::v_cmp_lt_u16:
739 op = aco_opcode::v_cmp_gt_u16;
740 break;
741 case aco_opcode::v_cmp_ge_u16:
742 op = aco_opcode::v_cmp_le_u16;
743 break;
744 case aco_opcode::v_cmp_lt_f32:
745 op = aco_opcode::v_cmp_gt_f32;
746 break;
747 case aco_opcode::v_cmp_ge_f32:
748 op = aco_opcode::v_cmp_le_f32;
749 break;
750 case aco_opcode::v_cmp_lt_i32:
751 op = aco_opcode::v_cmp_gt_i32;
752 break;
753 case aco_opcode::v_cmp_ge_i32:
754 op = aco_opcode::v_cmp_le_i32;
755 break;
756 case aco_opcode::v_cmp_lt_u32:
757 op = aco_opcode::v_cmp_gt_u32;
758 break;
759 case aco_opcode::v_cmp_ge_u32:
760 op = aco_opcode::v_cmp_le_u32;
761 break;
762 case aco_opcode::v_cmp_lt_f64:
763 op = aco_opcode::v_cmp_gt_f64;
764 break;
765 case aco_opcode::v_cmp_ge_f64:
766 op = aco_opcode::v_cmp_le_f64;
767 break;
768 case aco_opcode::v_cmp_lt_i64:
769 op = aco_opcode::v_cmp_gt_i64;
770 break;
771 case aco_opcode::v_cmp_ge_i64:
772 op = aco_opcode::v_cmp_le_i64;
773 break;
774 case aco_opcode::v_cmp_lt_u64:
775 op = aco_opcode::v_cmp_gt_u64;
776 break;
777 case aco_opcode::v_cmp_ge_u64:
778 op = aco_opcode::v_cmp_le_u64;
779 break;
780 default: /* eq and ne are commutative */
781 break;
782 }
783 Temp t = src0;
784 src0 = src1;
785 src1 = t;
786 } else {
787 src1 = as_vgpr(ctx, src1);
788 }
789 }
790
791 Builder bld(ctx->program, ctx->block);
792 bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
793 }
794
795 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
796 {
797 Temp src0 = get_alu_src(ctx, instr->src[0]);
798 Temp src1 = get_alu_src(ctx, instr->src[1]);
799 Builder bld(ctx->program, ctx->block);
800
801 assert(dst.regClass() == bld.lm);
802 assert(src0.type() == RegType::sgpr);
803 assert(src1.type() == RegType::sgpr);
804 assert(src0.regClass() == src1.regClass());
805
806 /* Emit the SALU comparison instruction */
807 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
808 /* Turn the result into a per-lane bool */
809 bool_to_vector_condition(ctx, cmp, dst);
810 }
811
812 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
813 aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
814 {
815 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
816 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
817 bool use_valu = s_op == aco_opcode::num_opcodes ||
818 nir_dest_is_divergent(instr->dest.dest) ||
819 ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
820 ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
821 aco_opcode op = use_valu ? v_op : s_op;
822 assert(op != aco_opcode::num_opcodes);
823 assert(dst.regClass() == ctx->program->lane_mask);
824
825 if (use_valu)
826 emit_vopc_instruction(ctx, instr, op, dst);
827 else
828 emit_sopc_instruction(ctx, instr, op, dst);
829 }
830
831 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
832 {
833 Builder bld(ctx->program, ctx->block);
834 Temp src0 = get_alu_src(ctx, instr->src[0]);
835 Temp src1 = get_alu_src(ctx, instr->src[1]);
836
837 assert(dst.regClass() == bld.lm);
838 assert(src0.regClass() == bld.lm);
839 assert(src1.regClass() == bld.lm);
840
841 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
842 }
843
844 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
845 {
846 Builder bld(ctx->program, ctx->block);
847 Temp cond = get_alu_src(ctx, instr->src[0]);
848 Temp then = get_alu_src(ctx, instr->src[1]);
849 Temp els = get_alu_src(ctx, instr->src[2]);
850
851 assert(cond.regClass() == bld.lm);
852
853 if (dst.type() == RegType::vgpr) {
854 aco_ptr<Instruction> bcsel;
855 if (dst.size() == 1) {
856 then = as_vgpr(ctx, then);
857 els = as_vgpr(ctx, els);
858
859 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
860 } else if (dst.size() == 2) {
861 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
862 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
863 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
864 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
865
866 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
867 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
868
869 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
870 } else {
871 fprintf(stderr, "Unimplemented NIR instr bit size: ");
872 nir_print_instr(&instr->instr, stderr);
873 fprintf(stderr, "\n");
874 }
875 return;
876 }
877
878 if (instr->dest.dest.ssa.bit_size == 1) {
879 assert(dst.regClass() == bld.lm);
880 assert(then.regClass() == bld.lm);
881 assert(els.regClass() == bld.lm);
882 }
883
884 if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
885 if (dst.regClass() == s1 || dst.regClass() == s2) {
886 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
887 assert(dst.size() == then.size());
888 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
889 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
890 } else {
891 fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
892 nir_print_instr(&instr->instr, stderr);
893 fprintf(stderr, "\n");
894 }
895 return;
896 }
897
898 /* divergent boolean bcsel
899 * this implements bcsel on bools: dst = s0 ? s1 : s2
900 * are going to be: dst = (s0 & s1) | (~s0 & s2) */
901 assert(instr->dest.dest.ssa.bit_size == 1);
902
903 if (cond.id() != then.id())
904 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
905
906 if (cond.id() == els.id())
907 bld.sop1(Builder::s_mov, Definition(dst), then);
908 else
909 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
910 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
911 }
912
913 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
914 aco_opcode op, uint32_t undo)
915 {
916 /* multiply by 16777216 to handle denormals */
917 Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
918 as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
919 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
920 scaled = bld.vop1(op, bld.def(v1), scaled);
921 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
922
923 Temp not_scaled = bld.vop1(op, bld.def(v1), val);
924
925 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
926 }
927
928 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
929 {
930 if (ctx->block->fp_mode.denorm32 == 0) {
931 bld.vop1(aco_opcode::v_rcp_f32, dst, val);
932 return;
933 }
934
935 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
936 }
937
938 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
939 {
940 if (ctx->block->fp_mode.denorm32 == 0) {
941 bld.vop1(aco_opcode::v_rsq_f32, dst, val);
942 return;
943 }
944
945 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
946 }
947
948 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
949 {
950 if (ctx->block->fp_mode.denorm32 == 0) {
951 bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
952 return;
953 }
954
955 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
956 }
957
958 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
959 {
960 if (ctx->block->fp_mode.denorm32 == 0) {
961 bld.vop1(aco_opcode::v_log_f32, dst, val);
962 return;
963 }
964
965 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
966 }
967
968 Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
969 {
970 if (ctx->options->chip_class >= GFX7)
971 return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
972
973 /* GFX6 doesn't support V_TRUNC_F64, lower it. */
974 /* TODO: create more efficient code! */
975 if (val.type() == RegType::sgpr)
976 val = as_vgpr(ctx, val);
977
978 /* Split the input value. */
979 Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
980 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
981
982 /* Extract the exponent and compute the unbiased value. */
983 Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
984 exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));
985
986 /* Extract the fractional part. */
987 Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
988 fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
989
990 Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
991 bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);
992
993 Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
994 Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
995 fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
996 tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
997 fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
998
999 /* Get the sign bit. */
1000 Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);
1001
1002 /* Decide the operation to apply depending on the unbiased exponent. */
1003 Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
1004 Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
1005 Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1006 Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
1007 dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1008 dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1009
1010 return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1011 }
1012
1013 Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
1014 {
1015 if (ctx->options->chip_class >= GFX7)
1016 return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1017
1018 /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1019 * lowered at NIR level for precision reasons). */
1020 Temp src0 = as_vgpr(ctx, val);
1021
1022 Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
1023 Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));
1024
1025 Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
1026 Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1027 Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
1028
1029 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1030 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1031 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1032 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1033
1034 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1035 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1036
1037 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1038
1039 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1040 static_cast<VOP3A_instruction*>(add)->neg[1] = true;
1041
1042 return add->definitions[0].getTemp();
1043 }
1044
1045 Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) {
1046 if (!dst.id()) {
1047 if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
1048 dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
1049 else
1050 dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
1051 }
1052
1053 if (dst.bytes() == src.bytes() && dst_bits < src_bits)
1054 return bld.copy(Definition(dst), src);
1055 else if (dst.bytes() < src.bytes())
1056 return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
1057
1058 Temp tmp = dst;
1059 if (dst_bits == 64)
1060 tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
1061
1062 if (tmp == src) {
1063 } else if (src.regClass() == s1) {
1064 if (is_signed)
1065 bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
1066 else
1067 bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
1068 } else if (ctx->options->chip_class >= GFX8) {
1069 assert(src_bits != 8 || src.regClass() == v1b);
1070 assert(src_bits != 16 || src.regClass() == v2b);
1071 aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
1072 sdwa->operands[0] = Operand(src);
1073 sdwa->definitions[0] = Definition(tmp);
1074 if (is_signed)
1075 sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
1076 else
1077 sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
1078 sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
1079 bld.insert(std::move(sdwa));
1080 } else {
1081 assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
1082 aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
1083 bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
1084 }
1085
1086 if (dst_bits == 64) {
1087 if (is_signed && dst.regClass() == s2) {
1088 Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
1089 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
1090 } else if (is_signed && dst.regClass() == v2) {
1091 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
1092 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
1093 } else {
1094 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
1095 }
1096 }
1097
1098 return dst;
1099 }
1100
1101 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
1102 {
1103 if (!instr->dest.dest.is_ssa) {
1104 fprintf(stderr, "nir alu dst not in ssa: ");
1105 nir_print_instr(&instr->instr, stderr);
1106 fprintf(stderr, "\n");
1107 abort();
1108 }
1109 Builder bld(ctx->program, ctx->block);
1110 bld.is_precise = instr->exact;
1111 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
1112 switch(instr->op) {
1113 case nir_op_vec2:
1114 case nir_op_vec3:
1115 case nir_op_vec4: {
1116 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
1117 unsigned num = instr->dest.dest.ssa.num_components;
1118 for (unsigned i = 0; i < num; ++i)
1119 elems[i] = get_alu_src(ctx, instr->src[i]);
1120
1121 if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
1122 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
1123 RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
1124 for (unsigned i = 0; i < num; ++i) {
1125 if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1126 vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
1127 else
1128 vec->operands[i] = Operand{elems[i]};
1129 }
1130 vec->definitions[0] = Definition(dst);
1131 ctx->block->instructions.emplace_back(std::move(vec));
1132 ctx->allocated_vec.emplace(dst.id(), elems);
1133 } else {
1134 // TODO: that is a bit suboptimal..
1135 Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1));
1136 for (unsigned i = 0; i < num - 1; ++i)
1137 if (((i+1) * instr->dest.dest.ssa.bit_size) % 32)
1138 elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1139 for (unsigned i = 0; i < num; ++i) {
1140 unsigned bit = i * instr->dest.dest.ssa.bit_size;
1141 if (bit % 32 == 0) {
1142 elems[bit / 32] = elems[i];
1143 } else {
1144 elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
1145 elems[i], Operand((i * instr->dest.dest.ssa.bit_size) % 32));
1146 elems[bit / 32] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[bit / 32], elems[i]);
1147 }
1148 }
1149 if (dst.size() == 1)
1150 bld.copy(Definition(dst), elems[0]);
1151 else
1152 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), elems[0], elems[1]);
1153 }
1154 break;
1155 }
1156 case nir_op_mov: {
1157 Temp src = get_alu_src(ctx, instr->src[0]);
1158 aco_ptr<Instruction> mov;
1159 if (dst.type() == RegType::sgpr) {
1160 if (src.type() == RegType::vgpr)
1161 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1162 else if (src.regClass() == s1)
1163 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
1164 else if (src.regClass() == s2)
1165 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
1166 else
1167 unreachable("wrong src register class for nir_op_imov");
1168 } else {
1169 if (dst.regClass() == v1)
1170 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
1171 else if (dst.regClass() == v1b ||
1172 dst.regClass() == v2b ||
1173 dst.regClass() == v2)
1174 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
1175 else
1176 unreachable("wrong src register class for nir_op_imov");
1177 }
1178 break;
1179 }
1180 case nir_op_inot: {
1181 Temp src = get_alu_src(ctx, instr->src[0]);
1182 if (instr->dest.dest.ssa.bit_size == 1) {
1183 assert(src.regClass() == bld.lm);
1184 assert(dst.regClass() == bld.lm);
1185 /* Don't use s_andn2 here, this allows the optimizer to make a better decision */
1186 Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
1187 bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
1188 } else if (dst.regClass() == v1) {
1189 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1190 } else if (dst.regClass() == v2) {
1191 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1192 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1193 lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1194 hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1195 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1196 } else if (dst.type() == RegType::sgpr) {
1197 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1198 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1199 } else {
1200 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1201 nir_print_instr(&instr->instr, stderr);
1202 fprintf(stderr, "\n");
1203 }
1204 break;
1205 }
1206 case nir_op_ineg: {
1207 Temp src = get_alu_src(ctx, instr->src[0]);
1208 if (dst.regClass() == v1) {
1209 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
1210 } else if (dst.regClass() == s1) {
1211 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
1212 } else if (dst.size() == 2) {
1213 Temp src0 = bld.tmp(dst.type(), 1);
1214 Temp src1 = bld.tmp(dst.type(), 1);
1215 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
1216
1217 if (dst.regClass() == s2) {
1218 Temp carry = bld.tmp(s1);
1219 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
1220 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
1221 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1222 } else {
1223 Temp lower = bld.tmp(v1);
1224 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
1225 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
1226 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1227 }
1228 } else {
1229 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1230 nir_print_instr(&instr->instr, stderr);
1231 fprintf(stderr, "\n");
1232 }
1233 break;
1234 }
1235 case nir_op_iabs: {
1236 if (dst.regClass() == s1) {
1237 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
1238 } else if (dst.regClass() == v1) {
1239 Temp src = get_alu_src(ctx, instr->src[0]);
1240 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
1241 } else {
1242 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1243 nir_print_instr(&instr->instr, stderr);
1244 fprintf(stderr, "\n");
1245 }
1246 break;
1247 }
1248 case nir_op_isign: {
1249 Temp src = get_alu_src(ctx, instr->src[0]);
1250 if (dst.regClass() == s1) {
1251 Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1));
1252 bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u));
1253 } else if (dst.regClass() == s2) {
1254 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
1255 Temp neqz;
1256 if (ctx->program->chip_class >= GFX8)
1257 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
1258 else
1259 neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
1260 /* SCC gets zero-extended to 64 bit */
1261 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1262 } else if (dst.regClass() == v1) {
1263 bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand((uint32_t)-1), src, Operand(1u));
1264 } else if (dst.regClass() == v2) {
1265 Temp upper = emit_extract_vector(ctx, src, 1, v1);
1266 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
1267 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1268 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
1269 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
1270 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1271 } else {
1272 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1273 nir_print_instr(&instr->instr, stderr);
1274 fprintf(stderr, "\n");
1275 }
1276 break;
1277 }
1278 case nir_op_imax: {
1279 if (dst.regClass() == v1) {
1280 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1281 } else if (dst.regClass() == s1) {
1282 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1283 } else {
1284 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1285 nir_print_instr(&instr->instr, stderr);
1286 fprintf(stderr, "\n");
1287 }
1288 break;
1289 }
1290 case nir_op_umax: {
1291 if (dst.regClass() == v1) {
1292 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1293 } else if (dst.regClass() == s1) {
1294 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1295 } else {
1296 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1297 nir_print_instr(&instr->instr, stderr);
1298 fprintf(stderr, "\n");
1299 }
1300 break;
1301 }
1302 case nir_op_imin: {
1303 if (dst.regClass() == v1) {
1304 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1305 } else if (dst.regClass() == s1) {
1306 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1307 } else {
1308 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1309 nir_print_instr(&instr->instr, stderr);
1310 fprintf(stderr, "\n");
1311 }
1312 break;
1313 }
1314 case nir_op_umin: {
1315 if (dst.regClass() == v1) {
1316 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1317 } else if (dst.regClass() == s1) {
1318 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1319 } else {
1320 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1321 nir_print_instr(&instr->instr, stderr);
1322 fprintf(stderr, "\n");
1323 }
1324 break;
1325 }
1326 case nir_op_ior: {
1327 if (instr->dest.dest.ssa.bit_size == 1) {
1328 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1329 } else if (dst.regClass() == v1) {
1330 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1331 } else if (dst.regClass() == v2) {
1332 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1333 } else if (dst.regClass() == s1) {
1334 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1335 } else if (dst.regClass() == s2) {
1336 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1337 } else {
1338 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1339 nir_print_instr(&instr->instr, stderr);
1340 fprintf(stderr, "\n");
1341 }
1342 break;
1343 }
1344 case nir_op_iand: {
1345 if (instr->dest.dest.ssa.bit_size == 1) {
1346 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1347 } else if (dst.regClass() == v1) {
1348 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1349 } else if (dst.regClass() == v2) {
1350 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1351 } else if (dst.regClass() == s1) {
1352 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1353 } else if (dst.regClass() == s2) {
1354 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1355 } else {
1356 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1357 nir_print_instr(&instr->instr, stderr);
1358 fprintf(stderr, "\n");
1359 }
1360 break;
1361 }
1362 case nir_op_ixor: {
1363 if (instr->dest.dest.ssa.bit_size == 1) {
1364 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1365 } else if (dst.regClass() == v1) {
1366 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1367 } else if (dst.regClass() == v2) {
1368 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1369 } else if (dst.regClass() == s1) {
1370 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1371 } else if (dst.regClass() == s2) {
1372 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1373 } else {
1374 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1375 nir_print_instr(&instr->instr, stderr);
1376 fprintf(stderr, "\n");
1377 }
1378 break;
1379 }
1380 case nir_op_ushr: {
1381 if (dst.regClass() == v1) {
1382 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1383 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1384 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
1385 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1386 } else if (dst.regClass() == v2) {
1387 bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
1388 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1389 } else if (dst.regClass() == s2) {
1390 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1391 } else if (dst.regClass() == s1) {
1392 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1393 } else {
1394 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1395 nir_print_instr(&instr->instr, stderr);
1396 fprintf(stderr, "\n");
1397 }
1398 break;
1399 }
1400 case nir_op_ishl: {
1401 if (dst.regClass() == v1) {
1402 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
1403 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1404 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
1405 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1406 } else if (dst.regClass() == v2) {
1407 bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
1408 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1409 } else if (dst.regClass() == s1) {
1410 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
1411 } else if (dst.regClass() == s2) {
1412 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1413 } else {
1414 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1415 nir_print_instr(&instr->instr, stderr);
1416 fprintf(stderr, "\n");
1417 }
1418 break;
1419 }
1420 case nir_op_ishr: {
1421 if (dst.regClass() == v1) {
1422 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1423 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1424 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
1425 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1426 } else if (dst.regClass() == v2) {
1427 bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
1428 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1429 } else if (dst.regClass() == s1) {
1430 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1431 } else if (dst.regClass() == s2) {
1432 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1433 } else {
1434 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1435 nir_print_instr(&instr->instr, stderr);
1436 fprintf(stderr, "\n");
1437 }
1438 break;
1439 }
1440 case nir_op_find_lsb: {
1441 Temp src = get_alu_src(ctx, instr->src[0]);
1442 if (src.regClass() == s1) {
1443 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1444 } else if (src.regClass() == v1) {
1445 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1446 } else if (src.regClass() == s2) {
1447 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1448 } else {
1449 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1450 nir_print_instr(&instr->instr, stderr);
1451 fprintf(stderr, "\n");
1452 }
1453 break;
1454 }
1455 case nir_op_ufind_msb:
1456 case nir_op_ifind_msb: {
1457 Temp src = get_alu_src(ctx, instr->src[0]);
1458 if (src.regClass() == s1 || src.regClass() == s2) {
1459 aco_opcode op = src.regClass() == s2 ?
1460 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1461 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1462 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1463
1464 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1465 Operand(src.size() * 32u - 1u), msb_rev);
1466 Temp msb = sub.def(0).getTemp();
1467 Temp carry = sub.def(1).getTemp();
1468
1469 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
1470 } else if (src.regClass() == v1) {
1471 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1472 Temp msb_rev = bld.tmp(v1);
1473 emit_vop1_instruction(ctx, instr, op, msb_rev);
1474 Temp msb = bld.tmp(v1);
1475 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1476 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1477 } else {
1478 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1479 nir_print_instr(&instr->instr, stderr);
1480 fprintf(stderr, "\n");
1481 }
1482 break;
1483 }
1484 case nir_op_bitfield_reverse: {
1485 if (dst.regClass() == s1) {
1486 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1487 } else if (dst.regClass() == v1) {
1488 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1489 } else {
1490 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1491 nir_print_instr(&instr->instr, stderr);
1492 fprintf(stderr, "\n");
1493 }
1494 break;
1495 }
1496 case nir_op_iadd: {
1497 if (dst.regClass() == s1) {
1498 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1499 break;
1500 }
1501
1502 Temp src0 = get_alu_src(ctx, instr->src[0]);
1503 Temp src1 = get_alu_src(ctx, instr->src[1]);
1504 if (dst.regClass() == v1) {
1505 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1506 break;
1507 }
1508
1509 assert(src0.size() == 2 && src1.size() == 2);
1510 Temp src00 = bld.tmp(src0.type(), 1);
1511 Temp src01 = bld.tmp(dst.type(), 1);
1512 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1513 Temp src10 = bld.tmp(src1.type(), 1);
1514 Temp src11 = bld.tmp(dst.type(), 1);
1515 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1516
1517 if (dst.regClass() == s2) {
1518 Temp carry = bld.tmp(s1);
1519 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1520 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1521 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1522 } else if (dst.regClass() == v2) {
1523 Temp dst0 = bld.tmp(v1);
1524 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1525 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1526 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1527 } else {
1528 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1529 nir_print_instr(&instr->instr, stderr);
1530 fprintf(stderr, "\n");
1531 }
1532 break;
1533 }
1534 case nir_op_uadd_sat: {
1535 Temp src0 = get_alu_src(ctx, instr->src[0]);
1536 Temp src1 = get_alu_src(ctx, instr->src[1]);
1537 if (dst.regClass() == s1) {
1538 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1539 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1540 src0, src1);
1541 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1542 } else if (dst.regClass() == v1) {
1543 if (ctx->options->chip_class >= GFX9) {
1544 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1545 add->operands[0] = Operand(src0);
1546 add->operands[1] = Operand(src1);
1547 add->definitions[0] = Definition(dst);
1548 add->clamp = 1;
1549 ctx->block->instructions.emplace_back(std::move(add));
1550 } else {
1551 if (src1.regClass() != v1)
1552 std::swap(src0, src1);
1553 assert(src1.regClass() == v1);
1554 Temp tmp = bld.tmp(v1);
1555 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1556 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1557 }
1558 } else {
1559 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1560 nir_print_instr(&instr->instr, stderr);
1561 fprintf(stderr, "\n");
1562 }
1563 break;
1564 }
1565 case nir_op_uadd_carry: {
1566 Temp src0 = get_alu_src(ctx, instr->src[0]);
1567 Temp src1 = get_alu_src(ctx, instr->src[1]);
1568 if (dst.regClass() == s1) {
1569 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1570 break;
1571 }
1572 if (dst.regClass() == v1) {
1573 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1574 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1575 break;
1576 }
1577
1578 Temp src00 = bld.tmp(src0.type(), 1);
1579 Temp src01 = bld.tmp(dst.type(), 1);
1580 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1581 Temp src10 = bld.tmp(src1.type(), 1);
1582 Temp src11 = bld.tmp(dst.type(), 1);
1583 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1584 if (dst.regClass() == s2) {
1585 Temp carry = bld.tmp(s1);
1586 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1587 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1588 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1589 } else if (dst.regClass() == v2) {
1590 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1591 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1592 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1593 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1594 } else {
1595 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1596 nir_print_instr(&instr->instr, stderr);
1597 fprintf(stderr, "\n");
1598 }
1599 break;
1600 }
1601 case nir_op_isub: {
1602 if (dst.regClass() == s1) {
1603 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1604 break;
1605 }
1606
1607 Temp src0 = get_alu_src(ctx, instr->src[0]);
1608 Temp src1 = get_alu_src(ctx, instr->src[1]);
1609 if (dst.regClass() == v1) {
1610 bld.vsub32(Definition(dst), src0, src1);
1611 break;
1612 }
1613
1614 Temp src00 = bld.tmp(src0.type(), 1);
1615 Temp src01 = bld.tmp(dst.type(), 1);
1616 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1617 Temp src10 = bld.tmp(src1.type(), 1);
1618 Temp src11 = bld.tmp(dst.type(), 1);
1619 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1620 if (dst.regClass() == s2) {
1621 Temp carry = bld.tmp(s1);
1622 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1623 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1624 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1625 } else if (dst.regClass() == v2) {
1626 Temp lower = bld.tmp(v1);
1627 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1628 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1629 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1630 } else {
1631 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1632 nir_print_instr(&instr->instr, stderr);
1633 fprintf(stderr, "\n");
1634 }
1635 break;
1636 }
1637 case nir_op_usub_borrow: {
1638 Temp src0 = get_alu_src(ctx, instr->src[0]);
1639 Temp src1 = get_alu_src(ctx, instr->src[1]);
1640 if (dst.regClass() == s1) {
1641 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1642 break;
1643 } else if (dst.regClass() == v1) {
1644 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1645 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1646 break;
1647 }
1648
1649 Temp src00 = bld.tmp(src0.type(), 1);
1650 Temp src01 = bld.tmp(dst.type(), 1);
1651 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1652 Temp src10 = bld.tmp(src1.type(), 1);
1653 Temp src11 = bld.tmp(dst.type(), 1);
1654 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1655 if (dst.regClass() == s2) {
1656 Temp borrow = bld.tmp(s1);
1657 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1658 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1659 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1660 } else if (dst.regClass() == v2) {
1661 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1662 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1663 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1664 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1665 } else {
1666 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1667 nir_print_instr(&instr->instr, stderr);
1668 fprintf(stderr, "\n");
1669 }
1670 break;
1671 }
1672 case nir_op_imul: {
1673 if (dst.regClass() == v1) {
1674 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1675 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1676 } else if (dst.regClass() == s1) {
1677 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1678 } else {
1679 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1680 nir_print_instr(&instr->instr, stderr);
1681 fprintf(stderr, "\n");
1682 }
1683 break;
1684 }
1685 case nir_op_umul_high: {
1686 if (dst.regClass() == v1) {
1687 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1688 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1689 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1690 } else if (dst.regClass() == s1) {
1691 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1692 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1693 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1694 } else {
1695 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1696 nir_print_instr(&instr->instr, stderr);
1697 fprintf(stderr, "\n");
1698 }
1699 break;
1700 }
1701 case nir_op_imul_high: {
1702 if (dst.regClass() == v1) {
1703 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1704 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1705 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1706 } else if (dst.regClass() == s1) {
1707 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1708 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1709 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1710 } else {
1711 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1712 nir_print_instr(&instr->instr, stderr);
1713 fprintf(stderr, "\n");
1714 }
1715 break;
1716 }
1717 case nir_op_fmul: {
1718 Temp src0 = get_alu_src(ctx, instr->src[0]);
1719 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1720 if (dst.regClass() == v2b) {
1721 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
1722 } else if (dst.regClass() == v1) {
1723 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1724 } else if (dst.regClass() == v2) {
1725 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
1726 } else {
1727 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1728 nir_print_instr(&instr->instr, stderr);
1729 fprintf(stderr, "\n");
1730 }
1731 break;
1732 }
1733 case nir_op_fadd: {
1734 Temp src0 = get_alu_src(ctx, instr->src[0]);
1735 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1736 if (dst.regClass() == v2b) {
1737 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
1738 } else if (dst.regClass() == v1) {
1739 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1740 } else if (dst.regClass() == v2) {
1741 bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
1742 } else {
1743 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1744 nir_print_instr(&instr->instr, stderr);
1745 fprintf(stderr, "\n");
1746 }
1747 break;
1748 }
1749 case nir_op_fsub: {
1750 Temp src0 = get_alu_src(ctx, instr->src[0]);
1751 Temp src1 = get_alu_src(ctx, instr->src[1]);
1752 if (dst.regClass() == v2b) {
1753 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1754 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
1755 else
1756 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
1757 } else if (dst.regClass() == v1) {
1758 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1759 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1760 else
1761 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1762 } else if (dst.regClass() == v2) {
1763 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1764 as_vgpr(ctx, src0), as_vgpr(ctx, src1));
1765 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1766 sub->neg[1] = true;
1767 } else {
1768 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1769 nir_print_instr(&instr->instr, stderr);
1770 fprintf(stderr, "\n");
1771 }
1772 break;
1773 }
1774 case nir_op_fmax: {
1775 Temp src0 = get_alu_src(ctx, instr->src[0]);
1776 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1777 if (dst.regClass() == v2b) {
1778 // TODO: check fp_mode.must_flush_denorms16_64
1779 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
1780 } else if (dst.regClass() == v1) {
1781 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1782 } else if (dst.regClass() == v2) {
1783 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1784 Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1);
1785 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1786 } else {
1787 bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
1788 }
1789 } else {
1790 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1791 nir_print_instr(&instr->instr, stderr);
1792 fprintf(stderr, "\n");
1793 }
1794 break;
1795 }
1796 case nir_op_fmin: {
1797 Temp src0 = get_alu_src(ctx, instr->src[0]);
1798 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1799 if (dst.regClass() == v2b) {
1800 // TODO: check fp_mode.must_flush_denorms16_64
1801 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
1802 } else if (dst.regClass() == v1) {
1803 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1804 } else if (dst.regClass() == v2) {
1805 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1806 Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1);
1807 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1808 } else {
1809 bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
1810 }
1811 } else {
1812 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1813 nir_print_instr(&instr->instr, stderr);
1814 fprintf(stderr, "\n");
1815 }
1816 break;
1817 }
1818 case nir_op_fmax3: {
1819 if (dst.regClass() == v2b) {
1820 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false);
1821 } else if (dst.regClass() == v1) {
1822 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1823 } else {
1824 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1825 nir_print_instr(&instr->instr, stderr);
1826 fprintf(stderr, "\n");
1827 }
1828 break;
1829 }
1830 case nir_op_fmin3: {
1831 if (dst.regClass() == v2b) {
1832 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false);
1833 } else if (dst.regClass() == v1) {
1834 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1835 } else {
1836 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1837 nir_print_instr(&instr->instr, stderr);
1838 fprintf(stderr, "\n");
1839 }
1840 break;
1841 }
1842 case nir_op_fmed3: {
1843 if (dst.regClass() == v2b) {
1844 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false);
1845 } else if (dst.regClass() == v1) {
1846 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1847 } else {
1848 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1849 nir_print_instr(&instr->instr, stderr);
1850 fprintf(stderr, "\n");
1851 }
1852 break;
1853 }
1854 case nir_op_umax3: {
1855 if (dst.size() == 1) {
1856 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1857 } else {
1858 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1859 nir_print_instr(&instr->instr, stderr);
1860 fprintf(stderr, "\n");
1861 }
1862 break;
1863 }
1864 case nir_op_umin3: {
1865 if (dst.size() == 1) {
1866 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1867 } else {
1868 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1869 nir_print_instr(&instr->instr, stderr);
1870 fprintf(stderr, "\n");
1871 }
1872 break;
1873 }
1874 case nir_op_umed3: {
1875 if (dst.size() == 1) {
1876 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1877 } else {
1878 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1879 nir_print_instr(&instr->instr, stderr);
1880 fprintf(stderr, "\n");
1881 }
1882 break;
1883 }
1884 case nir_op_imax3: {
1885 if (dst.size() == 1) {
1886 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1887 } else {
1888 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1889 nir_print_instr(&instr->instr, stderr);
1890 fprintf(stderr, "\n");
1891 }
1892 break;
1893 }
1894 case nir_op_imin3: {
1895 if (dst.size() == 1) {
1896 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1897 } else {
1898 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1899 nir_print_instr(&instr->instr, stderr);
1900 fprintf(stderr, "\n");
1901 }
1902 break;
1903 }
1904 case nir_op_imed3: {
1905 if (dst.size() == 1) {
1906 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1907 } else {
1908 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1909 nir_print_instr(&instr->instr, stderr);
1910 fprintf(stderr, "\n");
1911 }
1912 break;
1913 }
1914 case nir_op_cube_face_coord: {
1915 Temp in = get_alu_src(ctx, instr->src[0], 3);
1916 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1917 emit_extract_vector(ctx, in, 1, v1),
1918 emit_extract_vector(ctx, in, 2, v1) };
1919 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1920 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1921 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1922 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1923 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1924 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1925 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1926 break;
1927 }
1928 case nir_op_cube_face_index: {
1929 Temp in = get_alu_src(ctx, instr->src[0], 3);
1930 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1931 emit_extract_vector(ctx, in, 1, v1),
1932 emit_extract_vector(ctx, in, 2, v1) };
1933 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1934 break;
1935 }
1936 case nir_op_bcsel: {
1937 emit_bcsel(ctx, instr, dst);
1938 break;
1939 }
1940 case nir_op_frsq: {
1941 Temp src = get_alu_src(ctx, instr->src[0]);
1942 if (dst.regClass() == v2b) {
1943 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
1944 } else if (dst.regClass() == v1) {
1945 emit_rsq(ctx, bld, Definition(dst), src);
1946 } else if (dst.regClass() == v2) {
1947 /* Lowered at NIR level for precision reasons. */
1948 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1949 } else {
1950 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1951 nir_print_instr(&instr->instr, stderr);
1952 fprintf(stderr, "\n");
1953 }
1954 break;
1955 }
1956 case nir_op_fneg: {
1957 Temp src = get_alu_src(ctx, instr->src[0]);
1958 if (dst.regClass() == v2b) {
1959 if (ctx->block->fp_mode.must_flush_denorms16_64)
1960 src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
1961 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
1962 } else if (dst.regClass() == v1) {
1963 if (ctx->block->fp_mode.must_flush_denorms32)
1964 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1965 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1966 } else if (dst.regClass() == v2) {
1967 if (ctx->block->fp_mode.must_flush_denorms16_64)
1968 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1969 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1970 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1971 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1972 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1973 } else {
1974 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1975 nir_print_instr(&instr->instr, stderr);
1976 fprintf(stderr, "\n");
1977 }
1978 break;
1979 }
1980 case nir_op_fabs: {
1981 Temp src = get_alu_src(ctx, instr->src[0]);
1982 if (dst.regClass() == v2b) {
1983 if (ctx->block->fp_mode.must_flush_denorms16_64)
1984 src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
1985 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
1986 } else if (dst.regClass() == v1) {
1987 if (ctx->block->fp_mode.must_flush_denorms32)
1988 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1989 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1990 } else if (dst.regClass() == v2) {
1991 if (ctx->block->fp_mode.must_flush_denorms16_64)
1992 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1993 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1994 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1995 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1996 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1997 } else {
1998 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1999 nir_print_instr(&instr->instr, stderr);
2000 fprintf(stderr, "\n");
2001 }
2002 break;
2003 }
2004 case nir_op_fsat: {
2005 Temp src = get_alu_src(ctx, instr->src[0]);
2006 if (dst.regClass() == v2b) {
2007 bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
2008 } else if (dst.regClass() == v1) {
2009 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2010 /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
2011 // TODO: confirm that this holds under any circumstances
2012 } else if (dst.regClass() == v2) {
2013 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
2014 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
2015 vop3->clamp = true;
2016 } else {
2017 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2018 nir_print_instr(&instr->instr, stderr);
2019 fprintf(stderr, "\n");
2020 }
2021 break;
2022 }
2023 case nir_op_flog2: {
2024 Temp src = get_alu_src(ctx, instr->src[0]);
2025 if (dst.regClass() == v2b) {
2026 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2027 } else if (dst.regClass() == v1) {
2028 emit_log2(ctx, bld, Definition(dst), src);
2029 } else {
2030 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2031 nir_print_instr(&instr->instr, stderr);
2032 fprintf(stderr, "\n");
2033 }
2034 break;
2035 }
2036 case nir_op_frcp: {
2037 Temp src = get_alu_src(ctx, instr->src[0]);
2038 if (dst.regClass() == v2b) {
2039 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2040 } else if (dst.regClass() == v1) {
2041 emit_rcp(ctx, bld, Definition(dst), src);
2042 } else if (dst.regClass() == v2) {
2043 /* Lowered at NIR level for precision reasons. */
2044 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2045 } else {
2046 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2047 nir_print_instr(&instr->instr, stderr);
2048 fprintf(stderr, "\n");
2049 }
2050 break;
2051 }
2052 case nir_op_fexp2: {
2053 if (dst.regClass() == v2b) {
2054 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2055 } else if (dst.regClass() == v1) {
2056 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2057 } else {
2058 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2059 nir_print_instr(&instr->instr, stderr);
2060 fprintf(stderr, "\n");
2061 }
2062 break;
2063 }
2064 case nir_op_fsqrt: {
2065 Temp src = get_alu_src(ctx, instr->src[0]);
2066 if (dst.regClass() == v2b) {
2067 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2068 } else if (dst.regClass() == v1) {
2069 emit_sqrt(ctx, bld, Definition(dst), src);
2070 } else if (dst.regClass() == v2) {
2071 /* Lowered at NIR level for precision reasons. */
2072 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2073 } else {
2074 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2075 nir_print_instr(&instr->instr, stderr);
2076 fprintf(stderr, "\n");
2077 }
2078 break;
2079 }
2080 case nir_op_ffract: {
2081 if (dst.regClass() == v2b) {
2082 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2083 } else if (dst.regClass() == v1) {
2084 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2085 } else if (dst.regClass() == v2) {
2086 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2087 } else {
2088 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2089 nir_print_instr(&instr->instr, stderr);
2090 fprintf(stderr, "\n");
2091 }
2092 break;
2093 }
2094 case nir_op_ffloor: {
2095 Temp src = get_alu_src(ctx, instr->src[0]);
2096 if (dst.regClass() == v2b) {
2097 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2098 } else if (dst.regClass() == v1) {
2099 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2100 } else if (dst.regClass() == v2) {
2101 emit_floor_f64(ctx, bld, Definition(dst), src);
2102 } else {
2103 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2104 nir_print_instr(&instr->instr, stderr);
2105 fprintf(stderr, "\n");
2106 }
2107 break;
2108 }
2109 case nir_op_fceil: {
2110 Temp src0 = get_alu_src(ctx, instr->src[0]);
2111 if (dst.regClass() == v2b) {
2112 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2113 } else if (dst.regClass() == v1) {
2114 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2115 } else if (dst.regClass() == v2) {
2116 if (ctx->options->chip_class >= GFX7) {
2117 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2118 } else {
2119 /* GFX6 doesn't support V_CEIL_F64, lower it. */
2120 /* trunc = trunc(src0)
2121 * if (src0 > 0.0 && src0 != trunc)
2122 * trunc += 1.0
2123 */
2124 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2125 Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u));
2126 Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
2127 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1);
2128 Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond);
2129 add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add);
2130 bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2131 }
2132 } else {
2133 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2134 nir_print_instr(&instr->instr, stderr);
2135 fprintf(stderr, "\n");
2136 }
2137 break;
2138 }
2139 case nir_op_ftrunc: {
2140 Temp src = get_alu_src(ctx, instr->src[0]);
2141 if (dst.regClass() == v2b) {
2142 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2143 } else if (dst.regClass() == v1) {
2144 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2145 } else if (dst.regClass() == v2) {
2146 emit_trunc_f64(ctx, bld, Definition(dst), src);
2147 } else {
2148 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2149 nir_print_instr(&instr->instr, stderr);
2150 fprintf(stderr, "\n");
2151 }
2152 break;
2153 }
2154 case nir_op_fround_even: {
2155 Temp src0 = get_alu_src(ctx, instr->src[0]);
2156 if (dst.regClass() == v2b) {
2157 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2158 } else if (dst.regClass() == v1) {
2159 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2160 } else if (dst.regClass() == v2) {
2161 if (ctx->options->chip_class >= GFX7) {
2162 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2163 } else {
2164 /* GFX6 doesn't support V_RNDNE_F64, lower it. */
2165 Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2166 bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2167
2168 Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u)));
2169 Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi));
2170 Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
2171 Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
2172 static_cast<VOP3A_instruction*>(sub)->neg[1] = true;
2173 tmp = sub->definitions[0].getTemp();
2174
2175 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu));
2176 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
2177 static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2178 Temp cond = vop3->definitions[0].getTemp();
2179
2180 Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2181 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2182 Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond);
2183 Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond);
2184
2185 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2186 }
2187 } else {
2188 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2189 nir_print_instr(&instr->instr, stderr);
2190 fprintf(stderr, "\n");
2191 }
2192 break;
2193 }
2194 case nir_op_fsin:
2195 case nir_op_fcos: {
2196 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2197 aco_ptr<Instruction> norm;
2198 if (dst.regClass() == v2b) {
2199 Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
2200 Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
2201 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2202 bld.vop1(opcode, Definition(dst), tmp);
2203 } else if (dst.regClass() == v1) {
2204 Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
2205 Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
2206
2207 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2208 if (ctx->options->chip_class < GFX9)
2209 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
2210
2211 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2212 bld.vop1(opcode, Definition(dst), tmp);
2213 } else {
2214 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2215 nir_print_instr(&instr->instr, stderr);
2216 fprintf(stderr, "\n");
2217 }
2218 break;
2219 }
2220 case nir_op_ldexp: {
2221 Temp src0 = get_alu_src(ctx, instr->src[0]);
2222 Temp src1 = get_alu_src(ctx, instr->src[1]);
2223 if (dst.regClass() == v2b) {
2224 emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2225 } else if (dst.regClass() == v1) {
2226 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
2227 } else if (dst.regClass() == v2) {
2228 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
2229 } else {
2230 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2231 nir_print_instr(&instr->instr, stderr);
2232 fprintf(stderr, "\n");
2233 }
2234 break;
2235 }
2236 case nir_op_frexp_sig: {
2237 Temp src = get_alu_src(ctx, instr->src[0]);
2238 if (dst.regClass() == v2b) {
2239 bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
2240 } else if (dst.regClass() == v1) {
2241 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
2242 } else if (dst.regClass() == v2) {
2243 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
2244 } else {
2245 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2246 nir_print_instr(&instr->instr, stderr);
2247 fprintf(stderr, "\n");
2248 }
2249 break;
2250 }
2251 case nir_op_frexp_exp: {
2252 Temp src = get_alu_src(ctx, instr->src[0]);
2253 if (instr->src[0].src.ssa->bit_size == 16) {
2254 Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2255 tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u));
2256 convert_int(ctx, bld, tmp, 8, 32, true, dst);
2257 } else if (instr->src[0].src.ssa->bit_size == 32) {
2258 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
2259 } else if (instr->src[0].src.ssa->bit_size == 64) {
2260 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
2261 } else {
2262 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2263 nir_print_instr(&instr->instr, stderr);
2264 fprintf(stderr, "\n");
2265 }
2266 break;
2267 }
2268 case nir_op_fsign: {
2269 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2270 if (dst.regClass() == v2b) {
2271 Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
2272 Temp minus_one = bld.copy(bld.def(v1), Operand(0xbc00u));
2273 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2274 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
2275 cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2276 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
2277 } else if (dst.regClass() == v1) {
2278 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2279 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
2280 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2281 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
2282 } else if (dst.regClass() == v2) {
2283 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2284 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2285 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
2286
2287 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2288 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
2289 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2290
2291 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2292 } else {
2293 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2294 nir_print_instr(&instr->instr, stderr);
2295 fprintf(stderr, "\n");
2296 }
2297 break;
2298 }
2299 case nir_op_f2f16:
2300 case nir_op_f2f16_rtne: {
2301 Temp src = get_alu_src(ctx, instr->src[0]);
2302 if (instr->src[0].src.ssa->bit_size == 64)
2303 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2304 if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2305 /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2306 * keep value numbering and the scheduler simpler.
2307 */
2308 bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2309 else
2310 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2311 break;
2312 }
2313 case nir_op_f2f16_rtz: {
2314 Temp src = get_alu_src(ctx, instr->src[0]);
2315 if (instr->src[0].src.ssa->bit_size == 64)
2316 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2317 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
2318 break;
2319 }
2320 case nir_op_f2f32: {
2321 if (instr->src[0].src.ssa->bit_size == 16) {
2322 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2323 } else if (instr->src[0].src.ssa->bit_size == 64) {
2324 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2325 } else {
2326 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2327 nir_print_instr(&instr->instr, stderr);
2328 fprintf(stderr, "\n");
2329 }
2330 break;
2331 }
2332 case nir_op_f2f64: {
2333 Temp src = get_alu_src(ctx, instr->src[0]);
2334 if (instr->src[0].src.ssa->bit_size == 16)
2335 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2336 bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2337 break;
2338 }
2339 case nir_op_i2f16: {
2340 assert(dst.regClass() == v2b);
2341 Temp src = get_alu_src(ctx, instr->src[0]);
2342 if (instr->src[0].src.ssa->bit_size == 8)
2343 src = convert_int(ctx, bld, src, 8, 16, true);
2344 else if (instr->src[0].src.ssa->bit_size == 64)
2345 src = convert_int(ctx, bld, src, 64, 32, false);
2346 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2347 break;
2348 }
2349 case nir_op_i2f32: {
2350 assert(dst.size() == 1);
2351 Temp src = get_alu_src(ctx, instr->src[0]);
2352 if (instr->src[0].src.ssa->bit_size <= 16)
2353 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2354 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2355 break;
2356 }
2357 case nir_op_i2f64: {
2358 if (instr->src[0].src.ssa->bit_size <= 32) {
2359 Temp src = get_alu_src(ctx, instr->src[0]);
2360 if (instr->src[0].src.ssa->bit_size <= 16)
2361 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2362 bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2363 } else if (instr->src[0].src.ssa->bit_size == 64) {
2364 Temp src = get_alu_src(ctx, instr->src[0]);
2365 RegClass rc = RegClass(src.type(), 1);
2366 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2367 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2368 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2369 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2370 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2371 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2372
2373 } else {
2374 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2375 nir_print_instr(&instr->instr, stderr);
2376 fprintf(stderr, "\n");
2377 }
2378 break;
2379 }
2380 case nir_op_u2f16: {
2381 assert(dst.regClass() == v2b);
2382 Temp src = get_alu_src(ctx, instr->src[0]);
2383 if (instr->src[0].src.ssa->bit_size == 8)
2384 src = convert_int(ctx, bld, src, 8, 16, false);
2385 else if (instr->src[0].src.ssa->bit_size == 64)
2386 src = convert_int(ctx, bld, src, 64, 32, false);
2387 bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2388 break;
2389 }
2390 case nir_op_u2f32: {
2391 assert(dst.size() == 1);
2392 Temp src = get_alu_src(ctx, instr->src[0]);
2393 if (instr->src[0].src.ssa->bit_size == 8) {
2394 bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
2395 } else {
2396 if (instr->src[0].src.ssa->bit_size == 16)
2397 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2398 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
2399 }
2400 break;
2401 }
2402 case nir_op_u2f64: {
2403 if (instr->src[0].src.ssa->bit_size <= 32) {
2404 Temp src = get_alu_src(ctx, instr->src[0]);
2405 if (instr->src[0].src.ssa->bit_size <= 16)
2406 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2407 bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
2408 } else if (instr->src[0].src.ssa->bit_size == 64) {
2409 Temp src = get_alu_src(ctx, instr->src[0]);
2410 RegClass rc = RegClass(src.type(), 1);
2411 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2412 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2413 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2414 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2415 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2416 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2417 } else {
2418 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2419 nir_print_instr(&instr->instr, stderr);
2420 fprintf(stderr, "\n");
2421 }
2422 break;
2423 }
2424 case nir_op_f2i8:
2425 case nir_op_f2i16: {
2426 if (instr->src[0].src.ssa->bit_size == 16)
2427 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
2428 else if (instr->src[0].src.ssa->bit_size == 32)
2429 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2430 else
2431 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2432 break;
2433 }
2434 case nir_op_f2u8:
2435 case nir_op_f2u16: {
2436 if (instr->src[0].src.ssa->bit_size == 16)
2437 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
2438 else if (instr->src[0].src.ssa->bit_size == 32)
2439 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2440 else
2441 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2442 break;
2443 }
2444 case nir_op_f2i32: {
2445 Temp src = get_alu_src(ctx, instr->src[0]);
2446 if (instr->src[0].src.ssa->bit_size == 16) {
2447 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2448 if (dst.type() == RegType::vgpr) {
2449 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
2450 } else {
2451 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2452 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
2453 }
2454 } else if (instr->src[0].src.ssa->bit_size == 32) {
2455 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2456 } else if (instr->src[0].src.ssa->bit_size == 64) {
2457 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2458 } else {
2459 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2460 nir_print_instr(&instr->instr, stderr);
2461 fprintf(stderr, "\n");
2462 }
2463 break;
2464 }
2465 case nir_op_f2u32: {
2466 Temp src = get_alu_src(ctx, instr->src[0]);
2467 if (instr->src[0].src.ssa->bit_size == 16) {
2468 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2469 if (dst.type() == RegType::vgpr) {
2470 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
2471 } else {
2472 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2473 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
2474 }
2475 } else if (instr->src[0].src.ssa->bit_size == 32) {
2476 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2477 } else if (instr->src[0].src.ssa->bit_size == 64) {
2478 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2479 } else {
2480 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2481 nir_print_instr(&instr->instr, stderr);
2482 fprintf(stderr, "\n");
2483 }
2484 break;
2485 }
2486 case nir_op_f2i64: {
2487 Temp src = get_alu_src(ctx, instr->src[0]);
2488 if (instr->src[0].src.ssa->bit_size == 16)
2489 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2490
2491 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2492 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2493 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
2494 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
2495 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2496 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
2497 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
2498 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
2499 Temp new_exponent = bld.tmp(v1);
2500 Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
2501 if (ctx->program->chip_class >= GFX8)
2502 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
2503 else
2504 mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
2505 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
2506 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2507 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2508 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
2509 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
2510 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
2511 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
2512 Temp new_lower = bld.tmp(v1);
2513 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
2514 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
2515 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
2516
2517 } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2518 if (src.type() == RegType::vgpr)
2519 src = bld.as_uniform(src);
2520 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
2521 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2522 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2523 exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
2524 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2525 Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
2526 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2527 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
2528 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2529 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
2530 mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
2531 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
2532 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
2533 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
2534 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2535 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2536 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
2537 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
2538 Temp borrow = bld.tmp(s1);
2539 lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
2540 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
2541 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2542
2543 } else if (instr->src[0