aco: fix sgpr ubfe/ibfe if the offset is too large
[mesa.git] / src / amd / compiler / aco_instruction_selection.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <stack>
29 #include <map>
30
31 #include "ac_shader_util.h"
32 #include "aco_ir.h"
33 #include "aco_builder.h"
34 #include "aco_interface.h"
35 #include "aco_instruction_selection_setup.cpp"
36 #include "util/fast_idiv_by_const.h"
37
38 namespace aco {
39 namespace {
40
41 #define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
42
43 static void _isel_err(isel_context *ctx, const char *file, unsigned line,
44 const nir_instr *instr, const char *msg)
45 {
46 char *out;
47 size_t outsize;
48 FILE *memf = open_memstream(&out, &outsize);
49
50 fprintf(memf, "%s: ", msg);
51 nir_print_instr(instr, memf);
52 fclose(memf);
53
54 _aco_err(ctx->program, file, line, out);
55 free(out);
56 }
57
58 class loop_info_RAII {
59 isel_context* ctx;
60 unsigned header_idx_old;
61 Block* exit_old;
62 bool divergent_cont_old;
63 bool divergent_branch_old;
64 bool divergent_if_old;
65
66 public:
67 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
68 : ctx(ctx),
69 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
70 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
71 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
72 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
73 {
74 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
75 ctx->cf_info.parent_loop.exit = loop_exit;
76 ctx->cf_info.parent_loop.has_divergent_continue = false;
77 ctx->cf_info.parent_loop.has_divergent_branch = false;
78 ctx->cf_info.parent_if.is_divergent = false;
79 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
80 }
81
82 ~loop_info_RAII()
83 {
84 ctx->cf_info.parent_loop.header_idx = header_idx_old;
85 ctx->cf_info.parent_loop.exit = exit_old;
86 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
87 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
88 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
89 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
90 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
91 ctx->cf_info.exec_potentially_empty_discard = false;
92 }
93 };
94
95 struct if_context {
96 Temp cond;
97
98 bool divergent_old;
99 bool exec_potentially_empty_discard_old;
100 bool exec_potentially_empty_break_old;
101 uint16_t exec_potentially_empty_break_depth_old;
102
103 unsigned BB_if_idx;
104 unsigned invert_idx;
105 bool uniform_has_then_branch;
106 bool then_branch_divergent;
107 Block BB_invert;
108 Block BB_endif;
109 };
110
111 static bool visit_cf_list(struct isel_context *ctx,
112 struct exec_list *list);
113
114 static void add_logical_edge(unsigned pred_idx, Block *succ)
115 {
116 succ->logical_preds.emplace_back(pred_idx);
117 }
118
119
120 static void add_linear_edge(unsigned pred_idx, Block *succ)
121 {
122 succ->linear_preds.emplace_back(pred_idx);
123 }
124
125 static void add_edge(unsigned pred_idx, Block *succ)
126 {
127 add_logical_edge(pred_idx, succ);
128 add_linear_edge(pred_idx, succ);
129 }
130
131 static void append_logical_start(Block *b)
132 {
133 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
134 }
135
136 static void append_logical_end(Block *b)
137 {
138 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
139 }
140
141 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
142 {
143 assert(ctx->allocated[def->index].id());
144 return ctx->allocated[def->index];
145 }
146
147 Temp emit_mbcnt(isel_context *ctx, Definition dst,
148 Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
149 {
150 Builder bld(ctx->program, ctx->block);
151 Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
152 Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));
153
154 if (ctx->program->wave_size == 32) {
155 return thread_id_lo;
156 } else if (ctx->program->chip_class <= GFX7) {
157 Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
158 return thread_id_hi;
159 } else {
160 Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo);
161 return thread_id_hi;
162 }
163 }
164
165 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
166 {
167 Builder bld(ctx->program, ctx->block);
168
169 if (!dst.id())
170 dst = bld.tmp(src.regClass());
171
172 assert(src.size() == dst.size());
173
174 if (ctx->stage != fragment_fs) {
175 if (!dst.id())
176 return src;
177
178 bld.copy(Definition(dst), src);
179 return dst;
180 }
181
182 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
183 ctx->program->needs_wqm |= program_needs_wqm;
184 return dst;
185 }
186
187 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
188 {
189 if (index.regClass() == s1)
190 return bld.readlane(bld.def(s1), data, index);
191
192 if (ctx->options->chip_class <= GFX7) {
193 /* GFX6-7: there is no bpermute instruction */
194 Operand index_op(index);
195 Operand input_data(data);
196 index_op.setLateKill(true);
197 input_data.setLateKill(true);
198
199 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
200 } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
201 /* GFX10 wave64 mode: emulate full-wave bpermute */
202 if (!ctx->has_gfx10_wave64_bpermute) {
203 ctx->has_gfx10_wave64_bpermute = true;
204 ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
205 ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
206 }
207
208 Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
209 Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
210 Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
211 Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
212 Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
213 Operand input_data(data);
214
215 index_x4.setLateKill(true);
216 input_data.setLateKill(true);
217 same_half.setLateKill(true);
218
219 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
220 } else {
221 /* GFX8-9 or GFX10 wave32: bpermute works normally */
222 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
223 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
224 }
225 }
226
227 static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask)
228 {
229 if (ctx->options->chip_class >= GFX8) {
230 unsigned and_mask = mask & 0x1f;
231 unsigned or_mask = (mask >> 5) & 0x1f;
232 unsigned xor_mask = (mask >> 10) & 0x1f;
233
234 uint16_t dpp_ctrl = 0xffff;
235
236 // TODO: we could use DPP8 for some swizzles
237 if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
238 unsigned res[4] = {0, 1, 2, 3};
239 for (unsigned i = 0; i < 4; i++)
240 res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
241 dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
242 } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
243 dpp_ctrl = dpp_row_rr(8);
244 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
245 dpp_ctrl = dpp_row_mirror;
246 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
247 dpp_ctrl = dpp_row_half_mirror;
248 }
249
250 if (dpp_ctrl != 0xffff)
251 return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
252 }
253
254 return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
255 }
256
257 Temp as_vgpr(isel_context *ctx, Temp val)
258 {
259 if (val.type() == RegType::sgpr) {
260 Builder bld(ctx->program, ctx->block);
261 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
262 }
263 assert(val.type() == RegType::vgpr);
264 return val;
265 }
266
267 //assumes a != 0xffffffff
268 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
269 {
270 assert(b != 0);
271 Builder bld(ctx->program, ctx->block);
272
273 if (util_is_power_of_two_or_zero(b)) {
274 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
275 return;
276 }
277
278 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
279
280 assert(info.multiplier <= 0xffffffff);
281
282 bool pre_shift = info.pre_shift != 0;
283 bool increment = info.increment != 0;
284 bool multiply = true;
285 bool post_shift = info.post_shift != 0;
286
287 if (!pre_shift && !increment && !multiply && !post_shift) {
288 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
289 return;
290 }
291
292 Temp pre_shift_dst = a;
293 if (pre_shift) {
294 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
295 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
296 }
297
298 Temp increment_dst = pre_shift_dst;
299 if (increment) {
300 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
301 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
302 }
303
304 Temp multiply_dst = increment_dst;
305 if (multiply) {
306 multiply_dst = post_shift ? bld.tmp(v1) : dst;
307 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
308 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
309 }
310
311 if (post_shift) {
312 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
313 }
314 }
315
316 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
317 {
318 Builder bld(ctx->program, ctx->block);
319 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
320 }
321
322
323 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
324 {
325 /* no need to extract the whole vector */
326 if (src.regClass() == dst_rc) {
327 assert(idx == 0);
328 return src;
329 }
330
331 assert(src.bytes() > (idx * dst_rc.bytes()));
332 Builder bld(ctx->program, ctx->block);
333 auto it = ctx->allocated_vec.find(src.id());
334 if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
335 if (it->second[idx].regClass() == dst_rc) {
336 return it->second[idx];
337 } else {
338 assert(!dst_rc.is_subdword());
339 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
340 return bld.copy(bld.def(dst_rc), it->second[idx]);
341 }
342 }
343
344 if (dst_rc.is_subdword())
345 src = as_vgpr(ctx, src);
346
347 if (src.bytes() == dst_rc.bytes()) {
348 assert(idx == 0);
349 return bld.copy(bld.def(dst_rc), src);
350 } else {
351 Temp dst = bld.tmp(dst_rc);
352 emit_extract_vector(ctx, src, idx, dst);
353 return dst;
354 }
355 }
356
357 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
358 {
359 if (num_components == 1)
360 return;
361 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
362 return;
363 RegClass rc;
364 if (num_components > vec_src.size()) {
365 if (vec_src.type() == RegType::sgpr) {
366 /* should still help get_alu_src() */
367 emit_split_vector(ctx, vec_src, vec_src.size());
368 return;
369 }
370 /* sub-dword split */
371 rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
372 } else {
373 rc = RegClass(vec_src.type(), vec_src.size() / num_components);
374 }
375 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
376 split->operands[0] = Operand(vec_src);
377 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
378 for (unsigned i = 0; i < num_components; i++) {
379 elems[i] = {ctx->program->allocateId(), rc};
380 split->definitions[i] = Definition(elems[i]);
381 }
382 ctx->block->instructions.emplace_back(std::move(split));
383 ctx->allocated_vec.emplace(vec_src.id(), elems);
384 }
385
386 /* This vector expansion uses a mask to determine which elements in the new vector
387 * come from the original vector. The other elements are undefined. */
388 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
389 {
390 emit_split_vector(ctx, vec_src, util_bitcount(mask));
391
392 if (vec_src == dst)
393 return;
394
395 Builder bld(ctx->program, ctx->block);
396 if (num_components == 1) {
397 if (dst.type() == RegType::sgpr)
398 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
399 else
400 bld.copy(Definition(dst), vec_src);
401 return;
402 }
403
404 unsigned component_size = dst.size() / num_components;
405 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
406
407 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
408 vec->definitions[0] = Definition(dst);
409 unsigned k = 0;
410 for (unsigned i = 0; i < num_components; i++) {
411 if (mask & (1 << i)) {
412 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
413 if (dst.type() == RegType::sgpr)
414 src = bld.as_uniform(src);
415 vec->operands[i] = Operand(src);
416 } else {
417 vec->operands[i] = Operand(0u);
418 }
419 elems[i] = vec->operands[i].getTemp();
420 }
421 ctx->block->instructions.emplace_back(std::move(vec));
422 ctx->allocated_vec.emplace(dst.id(), elems);
423 }
424
425 /* adjust misaligned small bit size loads */
426 void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
427 {
428 Builder bld(ctx->program, ctx->block);
429 Operand shift;
430 Temp select = Temp();
431 if (offset.isConstant()) {
432 assert(offset.constantValue() && offset.constantValue() < 4);
433 shift = Operand(offset.constantValue() * 8);
434 } else {
435 /* bit_offset = 8 * (offset & 0x3) */
436 Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
437 select = bld.tmp(s1);
438 shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
439 }
440
441 if (vec.size() == 1) {
442 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
443 } else if (vec.size() == 2) {
444 Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
445 bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
446 if (tmp == dst)
447 emit_split_vector(ctx, dst, 2);
448 else
449 emit_extract_vector(ctx, tmp, 0, dst);
450 } else if (vec.size() == 4) {
451 Temp lo = bld.tmp(s2), hi = bld.tmp(s2);
452 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
453 hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
454 if (select != Temp())
455 hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
456 lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
457 Temp mid = bld.tmp(s1);
458 lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
459 hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
460 mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
461 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
462 emit_split_vector(ctx, dst, 2);
463 }
464 }
465
466 void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
467 {
468 Builder bld(ctx->program, ctx->block);
469 if (offset.isTemp()) {
470 Temp tmp[4] = {vec, vec, vec, vec};
471
472 if (vec.size() == 4) {
473 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
474 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec);
475 } else if (vec.size() == 3) {
476 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
477 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
478 } else if (vec.size() == 2) {
479 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
480 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
481 }
482 for (unsigned i = 0; i < dst.size(); i++)
483 tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
484
485 vec = tmp[0];
486 if (dst.size() == 2)
487 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
488
489 offset = Operand(0u);
490 }
491
492 unsigned num_components = vec.bytes() / component_size;
493 if (vec.regClass() == dst.regClass()) {
494 assert(offset.constantValue() == 0);
495 bld.copy(Definition(dst), vec);
496 emit_split_vector(ctx, dst, num_components);
497 return;
498 }
499
500 emit_split_vector(ctx, vec, num_components);
501 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
502 RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
503
504 assert(offset.constantValue() % component_size == 0);
505 unsigned skip = offset.constantValue() / component_size;
506 for (unsigned i = skip; i < num_components; i++)
507 elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
508
509 /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
510 if (dst.type() == RegType::vgpr) {
511 num_components = dst.bytes() / component_size;
512 aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
513 for (unsigned i = 0; i < num_components; i++)
514 create_vec->operands[i] = Operand(elems[i]);
515 create_vec->definitions[0] = Definition(dst);
516 bld.insert(std::move(create_vec));
517
518 /* if dst is sgpr - split the src, but move the original to sgpr. */
519 } else if (skip) {
520 vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
521 byte_align_scalar(ctx, vec, offset, dst);
522 } else {
523 assert(dst.size() == vec.size());
524 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
525 }
526
527 ctx->allocated_vec.emplace(dst.id(), elems);
528 }
529
530 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
531 {
532 Builder bld(ctx->program, ctx->block);
533 if (!dst.id())
534 dst = bld.tmp(bld.lm);
535
536 assert(val.regClass() == s1);
537 assert(dst.regClass() == bld.lm);
538
539 return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
540 }
541
542 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
543 {
544 Builder bld(ctx->program, ctx->block);
545 if (!dst.id())
546 dst = bld.tmp(s1);
547
548 assert(val.regClass() == bld.lm);
549 assert(dst.regClass() == s1);
550
551 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
552 Temp tmp = bld.tmp(s1);
553 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
554 return emit_wqm(ctx, tmp, dst);
555 }
556
557 Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp())
558 {
559 if (!dst.id()) {
560 if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
561 dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
562 else
563 dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
564 }
565
566 if (dst.bytes() == src.bytes() && dst_bits < src_bits)
567 return bld.copy(Definition(dst), src);
568 else if (dst.bytes() < src.bytes())
569 return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
570
571 Temp tmp = dst;
572 if (dst_bits == 64)
573 tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
574
575 if (tmp == src) {
576 } else if (src.regClass() == s1) {
577 if (is_signed)
578 bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
579 else
580 bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
581 } else if (ctx->options->chip_class >= GFX8) {
582 assert(src_bits != 8 || src.regClass() == v1b);
583 assert(src_bits != 16 || src.regClass() == v2b);
584 aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
585 sdwa->operands[0] = Operand(src);
586 sdwa->definitions[0] = Definition(tmp);
587 if (is_signed)
588 sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
589 else
590 sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
591 sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
592 bld.insert(std::move(sdwa));
593 } else {
594 assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
595 aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
596 bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
597 }
598
599 if (dst_bits == 64) {
600 if (is_signed && dst.regClass() == s2) {
601 Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
602 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
603 } else if (is_signed && dst.regClass() == v2) {
604 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
605 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
606 } else {
607 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
608 }
609 }
610
611 return dst;
612 }
613
614 enum sgpr_extract_mode {
615 sgpr_extract_sext,
616 sgpr_extract_zext,
617 sgpr_extract_undef,
618 };
619
620 Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src, sgpr_extract_mode mode)
621 {
622 Temp vec = get_ssa_temp(ctx, src->src.ssa);
623 unsigned src_size = src->src.ssa->bit_size;
624 unsigned swizzle = src->swizzle[0];
625
626 if (vec.size() > 1) {
627 assert(src_size == 16);
628 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
629 swizzle = swizzle & 1;
630 }
631
632 Builder bld(ctx->program, ctx->block);
633 unsigned offset = src_size * swizzle;
634 Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
635
636 if (mode == sgpr_extract_undef && swizzle == 0) {
637 bld.copy(Definition(tmp), vec);
638 } else if (mode == sgpr_extract_undef || (offset == 24 && mode == sgpr_extract_zext)) {
639 bld.sop2(aco_opcode::s_lshr_b32, Definition(tmp), bld.def(s1, scc), vec, Operand(offset));
640 } else if (src_size == 8 && swizzle == 0 && mode == sgpr_extract_sext) {
641 bld.sop1(aco_opcode::s_sext_i32_i8, Definition(tmp), vec);
642 } else if (src_size == 16 && swizzle == 0 && mode == sgpr_extract_sext) {
643 bld.sop1(aco_opcode::s_sext_i32_i16, Definition(tmp), vec);
644 } else {
645 aco_opcode op = mode == sgpr_extract_zext ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
646 bld.sop2(op, Definition(tmp), bld.def(s1, scc), vec, Operand((src_size << 16) | offset));
647 }
648
649 if (dst.regClass() == s2)
650 convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
651
652 return dst;
653 }
654
655 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
656 {
657 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
658 return get_ssa_temp(ctx, src.src.ssa);
659
660 if (src.src.ssa->num_components == size) {
661 bool identity_swizzle = true;
662 for (unsigned i = 0; identity_swizzle && i < size; i++) {
663 if (src.swizzle[i] != i)
664 identity_swizzle = false;
665 }
666 if (identity_swizzle)
667 return get_ssa_temp(ctx, src.src.ssa);
668 }
669
670 Temp vec = get_ssa_temp(ctx, src.src.ssa);
671 unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
672 assert(elem_size > 0);
673 assert(vec.bytes() % elem_size == 0);
674
675 if (elem_size < 4 && vec.type() == RegType::sgpr) {
676 assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
677 assert(size == 1);
678 return extract_8_16_bit_sgpr_element(
679 ctx, Temp(ctx->program->allocateId(), s1), &src, sgpr_extract_undef);
680 }
681
682 RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
683 if (size == 1) {
684 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
685 } else {
686 assert(size <= 4);
687 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
688 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
689 for (unsigned i = 0; i < size; ++i) {
690 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
691 vec_instr->operands[i] = Operand{elems[i]};
692 }
693 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)};
694 vec_instr->definitions[0] = Definition(dst);
695 ctx->block->instructions.emplace_back(std::move(vec_instr));
696 ctx->allocated_vec.emplace(dst.id(), elems);
697 return dst;
698 }
699 }
700
701 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
702 {
703 if (ptr.size() == 2)
704 return ptr;
705 Builder bld(ctx->program, ctx->block);
706 if (ptr.type() == RegType::vgpr)
707 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
708 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
709 ptr, Operand((unsigned)ctx->options->address32_hi));
710 }
711
712 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
713 {
714 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
715 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
716 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
717 sop2->definitions[0] = Definition(dst);
718 if (instr->no_unsigned_wrap)
719 sop2->definitions[0].setNUW(true);
720 if (writes_scc)
721 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
722 ctx->block->instructions.emplace_back(std::move(sop2));
723 }
724
725 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
726 bool commutative, bool swap_srcs=false, bool flush_denorms = false)
727 {
728 Builder bld(ctx->program, ctx->block);
729 bld.is_precise = instr->exact;
730
731 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
732 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
733 if (src1.type() == RegType::sgpr) {
734 if (commutative && src0.type() == RegType::vgpr) {
735 Temp t = src0;
736 src0 = src1;
737 src1 = t;
738 } else {
739 src1 = as_vgpr(ctx, src1);
740 }
741 }
742
743 if (flush_denorms && ctx->program->chip_class < GFX9) {
744 assert(dst.size() == 1);
745 Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
746 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
747 } else {
748 bld.vop2(op, Definition(dst), src0, src1);
749 }
750 }
751
752 void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
753 aco_opcode op, Temp dst)
754 {
755 Builder bld(ctx->program, ctx->block);
756 bld.is_precise = instr->exact;
757
758 Temp src0 = get_alu_src(ctx, instr->src[0]);
759 Temp src1 = get_alu_src(ctx, instr->src[1]);
760
761 if (src1.type() == RegType::sgpr) {
762 assert(src0.type() == RegType::vgpr);
763 std::swap(src0, src1);
764 }
765
766 Temp src00 = bld.tmp(src0.type(), 1);
767 Temp src01 = bld.tmp(src0.type(), 1);
768 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
769 Temp src10 = bld.tmp(v1);
770 Temp src11 = bld.tmp(v1);
771 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
772 Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
773 Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
774 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
775 }
776
777 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
778 bool flush_denorms = false)
779 {
780 Temp src0 = get_alu_src(ctx, instr->src[0]);
781 Temp src1 = get_alu_src(ctx, instr->src[1]);
782 Temp src2 = get_alu_src(ctx, instr->src[2]);
783
784 /* ensure that the instruction has at most 1 sgpr operand
785 * The optimizer will inline constants for us */
786 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
787 src0 = as_vgpr(ctx, src0);
788 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
789 src1 = as_vgpr(ctx, src1);
790 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
791 src2 = as_vgpr(ctx, src2);
792
793 Builder bld(ctx->program, ctx->block);
794 bld.is_precise = instr->exact;
795 if (flush_denorms && ctx->program->chip_class < GFX9) {
796 assert(dst.size() == 1);
797 Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
798 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
799 } else {
800 bld.vop3(op, Definition(dst), src0, src1, src2);
801 }
802 }
803
804 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
805 {
806 Builder bld(ctx->program, ctx->block);
807 bld.is_precise = instr->exact;
808 if (dst.type() == RegType::sgpr)
809 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
810 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
811 else
812 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
813 }
814
815 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
816 {
817 Temp src0 = get_alu_src(ctx, instr->src[0]);
818 Temp src1 = get_alu_src(ctx, instr->src[1]);
819 assert(src0.size() == src1.size());
820
821 aco_ptr<Instruction> vopc;
822 if (src1.type() == RegType::sgpr) {
823 if (src0.type() == RegType::vgpr) {
824 /* to swap the operands, we might also have to change the opcode */
825 switch (op) {
826 case aco_opcode::v_cmp_lt_f16:
827 op = aco_opcode::v_cmp_gt_f16;
828 break;
829 case aco_opcode::v_cmp_ge_f16:
830 op = aco_opcode::v_cmp_le_f16;
831 break;
832 case aco_opcode::v_cmp_lt_i16:
833 op = aco_opcode::v_cmp_gt_i16;
834 break;
835 case aco_opcode::v_cmp_ge_i16:
836 op = aco_opcode::v_cmp_le_i16;
837 break;
838 case aco_opcode::v_cmp_lt_u16:
839 op = aco_opcode::v_cmp_gt_u16;
840 break;
841 case aco_opcode::v_cmp_ge_u16:
842 op = aco_opcode::v_cmp_le_u16;
843 break;
844 case aco_opcode::v_cmp_lt_f32:
845 op = aco_opcode::v_cmp_gt_f32;
846 break;
847 case aco_opcode::v_cmp_ge_f32:
848 op = aco_opcode::v_cmp_le_f32;
849 break;
850 case aco_opcode::v_cmp_lt_i32:
851 op = aco_opcode::v_cmp_gt_i32;
852 break;
853 case aco_opcode::v_cmp_ge_i32:
854 op = aco_opcode::v_cmp_le_i32;
855 break;
856 case aco_opcode::v_cmp_lt_u32:
857 op = aco_opcode::v_cmp_gt_u32;
858 break;
859 case aco_opcode::v_cmp_ge_u32:
860 op = aco_opcode::v_cmp_le_u32;
861 break;
862 case aco_opcode::v_cmp_lt_f64:
863 op = aco_opcode::v_cmp_gt_f64;
864 break;
865 case aco_opcode::v_cmp_ge_f64:
866 op = aco_opcode::v_cmp_le_f64;
867 break;
868 case aco_opcode::v_cmp_lt_i64:
869 op = aco_opcode::v_cmp_gt_i64;
870 break;
871 case aco_opcode::v_cmp_ge_i64:
872 op = aco_opcode::v_cmp_le_i64;
873 break;
874 case aco_opcode::v_cmp_lt_u64:
875 op = aco_opcode::v_cmp_gt_u64;
876 break;
877 case aco_opcode::v_cmp_ge_u64:
878 op = aco_opcode::v_cmp_le_u64;
879 break;
880 default: /* eq and ne are commutative */
881 break;
882 }
883 Temp t = src0;
884 src0 = src1;
885 src1 = t;
886 } else {
887 src1 = as_vgpr(ctx, src1);
888 }
889 }
890
891 Builder bld(ctx->program, ctx->block);
892 bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
893 }
894
895 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
896 {
897 Temp src0 = get_alu_src(ctx, instr->src[0]);
898 Temp src1 = get_alu_src(ctx, instr->src[1]);
899 Builder bld(ctx->program, ctx->block);
900
901 assert(dst.regClass() == bld.lm);
902 assert(src0.type() == RegType::sgpr);
903 assert(src1.type() == RegType::sgpr);
904 assert(src0.regClass() == src1.regClass());
905
906 /* Emit the SALU comparison instruction */
907 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
908 /* Turn the result into a per-lane bool */
909 bool_to_vector_condition(ctx, cmp, dst);
910 }
911
912 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
913 aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
914 {
915 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
916 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
917 bool use_valu = s_op == aco_opcode::num_opcodes ||
918 nir_dest_is_divergent(instr->dest.dest) ||
919 ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
920 ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
921 aco_opcode op = use_valu ? v_op : s_op;
922 assert(op != aco_opcode::num_opcodes);
923 assert(dst.regClass() == ctx->program->lane_mask);
924
925 if (use_valu)
926 emit_vopc_instruction(ctx, instr, op, dst);
927 else
928 emit_sopc_instruction(ctx, instr, op, dst);
929 }
930
931 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
932 {
933 Builder bld(ctx->program, ctx->block);
934 Temp src0 = get_alu_src(ctx, instr->src[0]);
935 Temp src1 = get_alu_src(ctx, instr->src[1]);
936
937 assert(dst.regClass() == bld.lm);
938 assert(src0.regClass() == bld.lm);
939 assert(src1.regClass() == bld.lm);
940
941 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
942 }
943
944 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
945 {
946 Builder bld(ctx->program, ctx->block);
947 Temp cond = get_alu_src(ctx, instr->src[0]);
948 Temp then = get_alu_src(ctx, instr->src[1]);
949 Temp els = get_alu_src(ctx, instr->src[2]);
950
951 assert(cond.regClass() == bld.lm);
952
953 if (dst.type() == RegType::vgpr) {
954 aco_ptr<Instruction> bcsel;
955 if (dst.size() == 1) {
956 then = as_vgpr(ctx, then);
957 els = as_vgpr(ctx, els);
958
959 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
960 } else if (dst.size() == 2) {
961 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
962 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
963 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
964 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
965
966 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
967 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
968
969 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
970 } else {
971 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
972 }
973 return;
974 }
975
976 if (instr->dest.dest.ssa.bit_size == 1) {
977 assert(dst.regClass() == bld.lm);
978 assert(then.regClass() == bld.lm);
979 assert(els.regClass() == bld.lm);
980 }
981
982 if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
983 if (dst.regClass() == s1 || dst.regClass() == s2) {
984 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
985 assert(dst.size() == then.size());
986 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
987 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
988 } else {
989 isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
990 }
991 return;
992 }
993
994 /* divergent boolean bcsel
995 * this implements bcsel on bools: dst = s0 ? s1 : s2
996 * are going to be: dst = (s0 & s1) | (~s0 & s2) */
997 assert(instr->dest.dest.ssa.bit_size == 1);
998
999 if (cond.id() != then.id())
1000 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
1001
1002 if (cond.id() == els.id())
1003 bld.sop1(Builder::s_mov, Definition(dst), then);
1004 else
1005 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
1006 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
1007 }
1008
1009 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
1010 aco_opcode op, uint32_t undo)
1011 {
1012 /* multiply by 16777216 to handle denormals */
1013 Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
1014 as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
1015 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
1016 scaled = bld.vop1(op, bld.def(v1), scaled);
1017 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
1018
1019 Temp not_scaled = bld.vop1(op, bld.def(v1), val);
1020
1021 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
1022 }
1023
1024 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
1025 {
1026 if (ctx->block->fp_mode.denorm32 == 0) {
1027 bld.vop1(aco_opcode::v_rcp_f32, dst, val);
1028 return;
1029 }
1030
1031 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
1032 }
1033
1034 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
1035 {
1036 if (ctx->block->fp_mode.denorm32 == 0) {
1037 bld.vop1(aco_opcode::v_rsq_f32, dst, val);
1038 return;
1039 }
1040
1041 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
1042 }
1043
1044 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
1045 {
1046 if (ctx->block->fp_mode.denorm32 == 0) {
1047 bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
1048 return;
1049 }
1050
1051 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
1052 }
1053
1054 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
1055 {
1056 if (ctx->block->fp_mode.denorm32 == 0) {
1057 bld.vop1(aco_opcode::v_log_f32, dst, val);
1058 return;
1059 }
1060
1061 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
1062 }
1063
1064 Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
1065 {
1066 if (ctx->options->chip_class >= GFX7)
1067 return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
1068
1069 /* GFX6 doesn't support V_TRUNC_F64, lower it. */
1070 /* TODO: create more efficient code! */
1071 if (val.type() == RegType::sgpr)
1072 val = as_vgpr(ctx, val);
1073
1074 /* Split the input value. */
1075 Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
1076 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
1077
1078 /* Extract the exponent and compute the unbiased value. */
1079 Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
1080 exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));
1081
1082 /* Extract the fractional part. */
1083 Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
1084 fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
1085
1086 Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
1087 bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);
1088
1089 Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
1090 Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
1091 fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
1092 tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
1093 fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
1094
1095 /* Get the sign bit. */
1096 Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);
1097
1098 /* Decide the operation to apply depending on the unbiased exponent. */
1099 Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
1100 Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
1101 Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1102 Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
1103 dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1104 dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1105
1106 return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1107 }
1108
1109 Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
1110 {
1111 if (ctx->options->chip_class >= GFX7)
1112 return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1113
1114 /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1115 * lowered at NIR level for precision reasons). */
1116 Temp src0 = as_vgpr(ctx, val);
1117
1118 Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
1119 Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));
1120
1121 Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
1122 Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1123 Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
1124
1125 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1126 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1127 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1128 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1129
1130 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1131 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1132
1133 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1134
1135 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1136 static_cast<VOP3A_instruction*>(add)->neg[1] = true;
1137
1138 return add->definitions[0].getTemp();
1139 }
1140
1141 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
1142 {
1143 if (!instr->dest.dest.is_ssa) {
1144 isel_err(&instr->instr, "nir alu dst not in ssa");
1145 abort();
1146 }
1147 Builder bld(ctx->program, ctx->block);
1148 bld.is_precise = instr->exact;
1149 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
1150 switch(instr->op) {
1151 case nir_op_vec2:
1152 case nir_op_vec3:
1153 case nir_op_vec4: {
1154 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
1155 unsigned num = instr->dest.dest.ssa.num_components;
1156 for (unsigned i = 0; i < num; ++i)
1157 elems[i] = get_alu_src(ctx, instr->src[i]);
1158
1159 if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
1160 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
1161 RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
1162 for (unsigned i = 0; i < num; ++i) {
1163 if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1164 vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
1165 else
1166 vec->operands[i] = Operand{elems[i]};
1167 }
1168 vec->definitions[0] = Definition(dst);
1169 ctx->block->instructions.emplace_back(std::move(vec));
1170 ctx->allocated_vec.emplace(dst.id(), elems);
1171 } else {
1172 // TODO: that is a bit suboptimal..
1173 Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1));
1174 for (unsigned i = 0; i < num - 1; ++i)
1175 if (((i+1) * instr->dest.dest.ssa.bit_size) % 32)
1176 elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1177 for (unsigned i = 0; i < num; ++i) {
1178 unsigned bit = i * instr->dest.dest.ssa.bit_size;
1179 if (bit % 32 == 0) {
1180 elems[bit / 32] = elems[i];
1181 } else {
1182 elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
1183 elems[i], Operand((i * instr->dest.dest.ssa.bit_size) % 32));
1184 elems[bit / 32] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[bit / 32], elems[i]);
1185 }
1186 }
1187 if (dst.size() == 1)
1188 bld.copy(Definition(dst), elems[0]);
1189 else
1190 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), elems[0], elems[1]);
1191 }
1192 break;
1193 }
1194 case nir_op_mov: {
1195 Temp src = get_alu_src(ctx, instr->src[0]);
1196 aco_ptr<Instruction> mov;
1197 if (dst.type() == RegType::sgpr) {
1198 if (src.type() == RegType::vgpr)
1199 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1200 else if (src.regClass() == s1)
1201 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
1202 else if (src.regClass() == s2)
1203 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
1204 else
1205 unreachable("wrong src register class for nir_op_imov");
1206 } else {
1207 if (dst.regClass() == v1)
1208 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
1209 else if (dst.regClass() == v1b ||
1210 dst.regClass() == v2b ||
1211 dst.regClass() == v2)
1212 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
1213 else
1214 unreachable("wrong src register class for nir_op_imov");
1215 }
1216 break;
1217 }
1218 case nir_op_inot: {
1219 Temp src = get_alu_src(ctx, instr->src[0]);
1220 if (instr->dest.dest.ssa.bit_size == 1) {
1221 assert(src.regClass() == bld.lm);
1222 assert(dst.regClass() == bld.lm);
1223 /* Don't use s_andn2 here, this allows the optimizer to make a better decision */
1224 Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
1225 bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
1226 } else if (dst.regClass() == v1) {
1227 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1228 } else if (dst.regClass() == v2) {
1229 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1230 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1231 lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1232 hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1233 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1234 } else if (dst.type() == RegType::sgpr) {
1235 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1236 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1237 } else {
1238 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1239 }
1240 break;
1241 }
1242 case nir_op_ineg: {
1243 Temp src = get_alu_src(ctx, instr->src[0]);
1244 if (dst.regClass() == v1) {
1245 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
1246 } else if (dst.regClass() == s1) {
1247 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
1248 } else if (dst.size() == 2) {
1249 Temp src0 = bld.tmp(dst.type(), 1);
1250 Temp src1 = bld.tmp(dst.type(), 1);
1251 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
1252
1253 if (dst.regClass() == s2) {
1254 Temp carry = bld.tmp(s1);
1255 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
1256 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
1257 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1258 } else {
1259 Temp lower = bld.tmp(v1);
1260 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
1261 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
1262 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1263 }
1264 } else {
1265 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1266 }
1267 break;
1268 }
1269 case nir_op_iabs: {
1270 if (dst.regClass() == s1) {
1271 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
1272 } else if (dst.regClass() == v1) {
1273 Temp src = get_alu_src(ctx, instr->src[0]);
1274 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
1275 } else {
1276 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1277 }
1278 break;
1279 }
1280 case nir_op_isign: {
1281 Temp src = get_alu_src(ctx, instr->src[0]);
1282 if (dst.regClass() == s1) {
1283 Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1));
1284 bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u));
1285 } else if (dst.regClass() == s2) {
1286 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
1287 Temp neqz;
1288 if (ctx->program->chip_class >= GFX8)
1289 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
1290 else
1291 neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
1292 /* SCC gets zero-extended to 64 bit */
1293 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1294 } else if (dst.regClass() == v1) {
1295 bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand((uint32_t)-1), src, Operand(1u));
1296 } else if (dst.regClass() == v2) {
1297 Temp upper = emit_extract_vector(ctx, src, 1, v1);
1298 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
1299 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1300 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
1301 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
1302 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1303 } else {
1304 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1305 }
1306 break;
1307 }
1308 case nir_op_imax: {
1309 if (dst.regClass() == v1) {
1310 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1311 } else if (dst.regClass() == s1) {
1312 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1313 } else {
1314 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1315 }
1316 break;
1317 }
1318 case nir_op_umax: {
1319 if (dst.regClass() == v1) {
1320 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1321 } else if (dst.regClass() == s1) {
1322 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1323 } else {
1324 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1325 }
1326 break;
1327 }
1328 case nir_op_imin: {
1329 if (dst.regClass() == v1) {
1330 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1331 } else if (dst.regClass() == s1) {
1332 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1333 } else {
1334 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1335 }
1336 break;
1337 }
1338 case nir_op_umin: {
1339 if (dst.regClass() == v1) {
1340 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1341 } else if (dst.regClass() == s1) {
1342 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1343 } else {
1344 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1345 }
1346 break;
1347 }
1348 case nir_op_ior: {
1349 if (instr->dest.dest.ssa.bit_size == 1) {
1350 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1351 } else if (dst.regClass() == v1) {
1352 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1353 } else if (dst.regClass() == v2) {
1354 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1355 } else if (dst.regClass() == s1) {
1356 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1357 } else if (dst.regClass() == s2) {
1358 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1359 } else {
1360 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1361 }
1362 break;
1363 }
1364 case nir_op_iand: {
1365 if (instr->dest.dest.ssa.bit_size == 1) {
1366 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1367 } else if (dst.regClass() == v1) {
1368 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1369 } else if (dst.regClass() == v2) {
1370 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1371 } else if (dst.regClass() == s1) {
1372 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1373 } else if (dst.regClass() == s2) {
1374 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1375 } else {
1376 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1377 }
1378 break;
1379 }
1380 case nir_op_ixor: {
1381 if (instr->dest.dest.ssa.bit_size == 1) {
1382 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1383 } else if (dst.regClass() == v1) {
1384 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1385 } else if (dst.regClass() == v2) {
1386 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1387 } else if (dst.regClass() == s1) {
1388 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1389 } else if (dst.regClass() == s2) {
1390 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1391 } else {
1392 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1393 }
1394 break;
1395 }
1396 case nir_op_ushr: {
1397 if (dst.regClass() == v1) {
1398 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1399 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1400 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
1401 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1402 } else if (dst.regClass() == v2) {
1403 bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
1404 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1405 } else if (dst.regClass() == s2) {
1406 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1407 } else if (dst.regClass() == s1) {
1408 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1409 } else {
1410 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1411 }
1412 break;
1413 }
1414 case nir_op_ishl: {
1415 if (dst.regClass() == v1) {
1416 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
1417 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1418 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
1419 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1420 } else if (dst.regClass() == v2) {
1421 bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
1422 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1423 } else if (dst.regClass() == s1) {
1424 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
1425 } else if (dst.regClass() == s2) {
1426 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1427 } else {
1428 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1429 }
1430 break;
1431 }
1432 case nir_op_ishr: {
1433 if (dst.regClass() == v1) {
1434 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1435 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1436 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
1437 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1438 } else if (dst.regClass() == v2) {
1439 bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
1440 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1441 } else if (dst.regClass() == s1) {
1442 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1443 } else if (dst.regClass() == s2) {
1444 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1445 } else {
1446 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1447 }
1448 break;
1449 }
1450 case nir_op_find_lsb: {
1451 Temp src = get_alu_src(ctx, instr->src[0]);
1452 if (src.regClass() == s1) {
1453 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1454 } else if (src.regClass() == v1) {
1455 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1456 } else if (src.regClass() == s2) {
1457 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1458 } else {
1459 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1460 }
1461 break;
1462 }
1463 case nir_op_ufind_msb:
1464 case nir_op_ifind_msb: {
1465 Temp src = get_alu_src(ctx, instr->src[0]);
1466 if (src.regClass() == s1 || src.regClass() == s2) {
1467 aco_opcode op = src.regClass() == s2 ?
1468 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1469 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1470 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1471
1472 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1473 Operand(src.size() * 32u - 1u), msb_rev);
1474 Temp msb = sub.def(0).getTemp();
1475 Temp carry = sub.def(1).getTemp();
1476
1477 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
1478 } else if (src.regClass() == v1) {
1479 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1480 Temp msb_rev = bld.tmp(v1);
1481 emit_vop1_instruction(ctx, instr, op, msb_rev);
1482 Temp msb = bld.tmp(v1);
1483 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1484 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1485 } else {
1486 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1487 }
1488 break;
1489 }
1490 case nir_op_bitfield_reverse: {
1491 if (dst.regClass() == s1) {
1492 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1493 } else if (dst.regClass() == v1) {
1494 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1495 } else {
1496 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1497 }
1498 break;
1499 }
1500 case nir_op_iadd: {
1501 if (dst.regClass() == s1) {
1502 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1503 break;
1504 }
1505
1506 Temp src0 = get_alu_src(ctx, instr->src[0]);
1507 Temp src1 = get_alu_src(ctx, instr->src[1]);
1508 if (dst.regClass() == v1) {
1509 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1510 break;
1511 }
1512
1513 assert(src0.size() == 2 && src1.size() == 2);
1514 Temp src00 = bld.tmp(src0.type(), 1);
1515 Temp src01 = bld.tmp(dst.type(), 1);
1516 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1517 Temp src10 = bld.tmp(src1.type(), 1);
1518 Temp src11 = bld.tmp(dst.type(), 1);
1519 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1520
1521 if (dst.regClass() == s2) {
1522 Temp carry = bld.tmp(s1);
1523 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1524 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1525 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1526 } else if (dst.regClass() == v2) {
1527 Temp dst0 = bld.tmp(v1);
1528 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1529 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1530 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1531 } else {
1532 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1533 }
1534 break;
1535 }
1536 case nir_op_uadd_sat: {
1537 Temp src0 = get_alu_src(ctx, instr->src[0]);
1538 Temp src1 = get_alu_src(ctx, instr->src[1]);
1539 if (dst.regClass() == s1) {
1540 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1541 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1542 src0, src1);
1543 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1544 } else if (dst.regClass() == v1) {
1545 if (ctx->options->chip_class >= GFX9) {
1546 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1547 add->operands[0] = Operand(src0);
1548 add->operands[1] = Operand(src1);
1549 add->definitions[0] = Definition(dst);
1550 add->clamp = 1;
1551 ctx->block->instructions.emplace_back(std::move(add));
1552 } else {
1553 if (src1.regClass() != v1)
1554 std::swap(src0, src1);
1555 assert(src1.regClass() == v1);
1556 Temp tmp = bld.tmp(v1);
1557 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1558 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1559 }
1560 } else {
1561 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1562 }
1563 break;
1564 }
1565 case nir_op_uadd_carry: {
1566 Temp src0 = get_alu_src(ctx, instr->src[0]);
1567 Temp src1 = get_alu_src(ctx, instr->src[1]);
1568 if (dst.regClass() == s1) {
1569 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1570 break;
1571 }
1572 if (dst.regClass() == v1) {
1573 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1574 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1575 break;
1576 }
1577
1578 Temp src00 = bld.tmp(src0.type(), 1);
1579 Temp src01 = bld.tmp(dst.type(), 1);
1580 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1581 Temp src10 = bld.tmp(src1.type(), 1);
1582 Temp src11 = bld.tmp(dst.type(), 1);
1583 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1584 if (dst.regClass() == s2) {
1585 Temp carry = bld.tmp(s1);
1586 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1587 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1588 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1589 } else if (dst.regClass() == v2) {
1590 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1591 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1592 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1593 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1594 } else {
1595 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1596 }
1597 break;
1598 }
1599 case nir_op_isub: {
1600 if (dst.regClass() == s1) {
1601 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1602 break;
1603 }
1604
1605 Temp src0 = get_alu_src(ctx, instr->src[0]);
1606 Temp src1 = get_alu_src(ctx, instr->src[1]);
1607 if (dst.regClass() == v1) {
1608 bld.vsub32(Definition(dst), src0, src1);
1609 break;
1610 }
1611
1612 Temp src00 = bld.tmp(src0.type(), 1);
1613 Temp src01 = bld.tmp(dst.type(), 1);
1614 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1615 Temp src10 = bld.tmp(src1.type(), 1);
1616 Temp src11 = bld.tmp(dst.type(), 1);
1617 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1618 if (dst.regClass() == s2) {
1619 Temp carry = bld.tmp(s1);
1620 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1621 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1622 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1623 } else if (dst.regClass() == v2) {
1624 Temp lower = bld.tmp(v1);
1625 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1626 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1627 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1628 } else {
1629 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1630 }
1631 break;
1632 }
1633 case nir_op_usub_borrow: {
1634 Temp src0 = get_alu_src(ctx, instr->src[0]);
1635 Temp src1 = get_alu_src(ctx, instr->src[1]);
1636 if (dst.regClass() == s1) {
1637 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1638 break;
1639 } else if (dst.regClass() == v1) {
1640 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1641 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1642 break;
1643 }
1644
1645 Temp src00 = bld.tmp(src0.type(), 1);
1646 Temp src01 = bld.tmp(dst.type(), 1);
1647 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1648 Temp src10 = bld.tmp(src1.type(), 1);
1649 Temp src11 = bld.tmp(dst.type(), 1);
1650 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1651 if (dst.regClass() == s2) {
1652 Temp borrow = bld.tmp(s1);
1653 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1654 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1655 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1656 } else if (dst.regClass() == v2) {
1657 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1658 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1659 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1660 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1661 } else {
1662 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1663 }
1664 break;
1665 }
1666 case nir_op_imul: {
1667 if (dst.regClass() == v1) {
1668 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1669 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1670 } else if (dst.regClass() == s1) {
1671 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1672 } else {
1673 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1674 }
1675 break;
1676 }
1677 case nir_op_umul_high: {
1678 if (dst.regClass() == v1) {
1679 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1680 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1681 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1682 } else if (dst.regClass() == s1) {
1683 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1684 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1685 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1686 } else {
1687 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1688 }
1689 break;
1690 }
1691 case nir_op_imul_high: {
1692 if (dst.regClass() == v1) {
1693 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1694 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1695 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1696 } else if (dst.regClass() == s1) {
1697 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1698 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1699 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1700 } else {
1701 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1702 }
1703 break;
1704 }
1705 case nir_op_fmul: {
1706 Temp src0 = get_alu_src(ctx, instr->src[0]);
1707 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1708 if (dst.regClass() == v2b) {
1709 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
1710 } else if (dst.regClass() == v1) {
1711 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1712 } else if (dst.regClass() == v2) {
1713 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
1714 } else {
1715 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1716 }
1717 break;
1718 }
1719 case nir_op_fadd: {
1720 Temp src0 = get_alu_src(ctx, instr->src[0]);
1721 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1722 if (dst.regClass() == v2b) {
1723 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
1724 } else if (dst.regClass() == v1) {
1725 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1726 } else if (dst.regClass() == v2) {
1727 bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
1728 } else {
1729 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1730 }
1731 break;
1732 }
1733 case nir_op_fsub: {
1734 Temp src0 = get_alu_src(ctx, instr->src[0]);
1735 Temp src1 = get_alu_src(ctx, instr->src[1]);
1736 if (dst.regClass() == v2b) {
1737 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1738 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
1739 else
1740 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
1741 } else if (dst.regClass() == v1) {
1742 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1743 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1744 else
1745 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1746 } else if (dst.regClass() == v2) {
1747 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1748 as_vgpr(ctx, src0), as_vgpr(ctx, src1));
1749 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1750 sub->neg[1] = true;
1751 } else {
1752 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1753 }
1754 break;
1755 }
1756 case nir_op_fmax: {
1757 Temp src0 = get_alu_src(ctx, instr->src[0]);
1758 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1759 if (dst.regClass() == v2b) {
1760 // TODO: check fp_mode.must_flush_denorms16_64
1761 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
1762 } else if (dst.regClass() == v1) {
1763 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1764 } else if (dst.regClass() == v2) {
1765 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1766 Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1);
1767 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1768 } else {
1769 bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
1770 }
1771 } else {
1772 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1773 }
1774 break;
1775 }
1776 case nir_op_fmin: {
1777 Temp src0 = get_alu_src(ctx, instr->src[0]);
1778 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1779 if (dst.regClass() == v2b) {
1780 // TODO: check fp_mode.must_flush_denorms16_64
1781 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
1782 } else if (dst.regClass() == v1) {
1783 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1784 } else if (dst.regClass() == v2) {
1785 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1786 Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1);
1787 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1788 } else {
1789 bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
1790 }
1791 } else {
1792 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1793 }
1794 break;
1795 }
1796 case nir_op_cube_face_coord: {
1797 Temp in = get_alu_src(ctx, instr->src[0], 3);
1798 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1799 emit_extract_vector(ctx, in, 1, v1),
1800 emit_extract_vector(ctx, in, 2, v1) };
1801 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1802 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1803 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1804 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1805 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1),
1806 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma), Operand(0x3f000000u/*0.5*/));
1807 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1),
1808 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma), Operand(0x3f000000u/*0.5*/));
1809 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1810 break;
1811 }
1812 case nir_op_cube_face_index: {
1813 Temp in = get_alu_src(ctx, instr->src[0], 3);
1814 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1815 emit_extract_vector(ctx, in, 1, v1),
1816 emit_extract_vector(ctx, in, 2, v1) };
1817 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1818 break;
1819 }
1820 case nir_op_bcsel: {
1821 emit_bcsel(ctx, instr, dst);
1822 break;
1823 }
1824 case nir_op_frsq: {
1825 Temp src = get_alu_src(ctx, instr->src[0]);
1826 if (dst.regClass() == v2b) {
1827 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
1828 } else if (dst.regClass() == v1) {
1829 emit_rsq(ctx, bld, Definition(dst), src);
1830 } else if (dst.regClass() == v2) {
1831 /* Lowered at NIR level for precision reasons. */
1832 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1833 } else {
1834 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1835 }
1836 break;
1837 }
1838 case nir_op_fneg: {
1839 Temp src = get_alu_src(ctx, instr->src[0]);
1840 if (dst.regClass() == v2b) {
1841 if (ctx->block->fp_mode.must_flush_denorms16_64)
1842 src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
1843 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
1844 } else if (dst.regClass() == v1) {
1845 if (ctx->block->fp_mode.must_flush_denorms32)
1846 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1847 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1848 } else if (dst.regClass() == v2) {
1849 if (ctx->block->fp_mode.must_flush_denorms16_64)
1850 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1851 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1852 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1853 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1854 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1855 } else {
1856 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1857 }
1858 break;
1859 }
1860 case nir_op_fabs: {
1861 Temp src = get_alu_src(ctx, instr->src[0]);
1862 if (dst.regClass() == v2b) {
1863 if (ctx->block->fp_mode.must_flush_denorms16_64)
1864 src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
1865 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
1866 } else if (dst.regClass() == v1) {
1867 if (ctx->block->fp_mode.must_flush_denorms32)
1868 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1869 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1870 } else if (dst.regClass() == v2) {
1871 if (ctx->block->fp_mode.must_flush_denorms16_64)
1872 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1873 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1874 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1875 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1876 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1877 } else {
1878 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1879 }
1880 break;
1881 }
1882 case nir_op_fsat: {
1883 Temp src = get_alu_src(ctx, instr->src[0]);
1884 if (dst.regClass() == v2b) {
1885 bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
1886 } else if (dst.regClass() == v1) {
1887 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1888 /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
1889 // TODO: confirm that this holds under any circumstances
1890 } else if (dst.regClass() == v2) {
1891 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1892 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1893 vop3->clamp = true;
1894 } else {
1895 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1896 }
1897 break;
1898 }
1899 case nir_op_flog2: {
1900 Temp src = get_alu_src(ctx, instr->src[0]);
1901 if (dst.regClass() == v2b) {
1902 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
1903 } else if (dst.regClass() == v1) {
1904 emit_log2(ctx, bld, Definition(dst), src);
1905 } else {
1906 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1907 }
1908 break;
1909 }
1910 case nir_op_frcp: {
1911 Temp src = get_alu_src(ctx, instr->src[0]);
1912 if (dst.regClass() == v2b) {
1913 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
1914 } else if (dst.regClass() == v1) {
1915 emit_rcp(ctx, bld, Definition(dst), src);
1916 } else if (dst.regClass() == v2) {
1917 /* Lowered at NIR level for precision reasons. */
1918 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1919 } else {
1920 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1921 }
1922 break;
1923 }
1924 case nir_op_fexp2: {
1925 if (dst.regClass() == v2b) {
1926 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
1927 } else if (dst.regClass() == v1) {
1928 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1929 } else {
1930 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1931 }
1932 break;
1933 }
1934 case nir_op_fsqrt: {
1935 Temp src = get_alu_src(ctx, instr->src[0]);
1936 if (dst.regClass() == v2b) {
1937 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
1938 } else if (dst.regClass() == v1) {
1939 emit_sqrt(ctx, bld, Definition(dst), src);
1940 } else if (dst.regClass() == v2) {
1941 /* Lowered at NIR level for precision reasons. */
1942 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1943 } else {
1944 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1945 }
1946 break;
1947 }
1948 case nir_op_ffract: {
1949 if (dst.regClass() == v2b) {
1950 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
1951 } else if (dst.regClass() == v1) {
1952 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1953 } else if (dst.regClass() == v2) {
1954 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1955 } else {
1956 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1957 }
1958 break;
1959 }
1960 case nir_op_ffloor: {
1961 Temp src = get_alu_src(ctx, instr->src[0]);
1962 if (dst.regClass() == v2b) {
1963 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
1964 } else if (dst.regClass() == v1) {
1965 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1966 } else if (dst.regClass() == v2) {
1967 emit_floor_f64(ctx, bld, Definition(dst), src);
1968 } else {
1969 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1970 }
1971 break;
1972 }
1973 case nir_op_fceil: {
1974 Temp src0 = get_alu_src(ctx, instr->src[0]);
1975 if (dst.regClass() == v2b) {
1976 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
1977 } else if (dst.regClass() == v1) {
1978 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1979 } else if (dst.regClass() == v2) {
1980 if (ctx->options->chip_class >= GFX7) {
1981 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1982 } else {
1983 /* GFX6 doesn't support V_CEIL_F64, lower it. */
1984 /* trunc = trunc(src0)
1985 * if (src0 > 0.0 && src0 != trunc)
1986 * trunc += 1.0
1987 */
1988 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
1989 Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u));
1990 Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
1991 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1);
1992 Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond);
1993 add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add);
1994 bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
1995 }
1996 } else {
1997 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1998 }
1999 break;
2000 }
2001 case nir_op_ftrunc: {
2002 Temp src = get_alu_src(ctx, instr->src[0]);
2003 if (dst.regClass() == v2b) {
2004 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2005 } else if (dst.regClass() == v1) {
2006 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2007 } else if (dst.regClass() == v2) {
2008 emit_trunc_f64(ctx, bld, Definition(dst), src);
2009 } else {
2010 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2011 }
2012 break;
2013 }
2014 case nir_op_fround_even: {
2015 Temp src0 = get_alu_src(ctx, instr->src[0]);
2016 if (dst.regClass() == v2b) {
2017 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2018 } else if (dst.regClass() == v1) {
2019 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2020 } else if (dst.regClass() == v2) {
2021 if (ctx->options->chip_class >= GFX7) {
2022 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2023 } else {
2024 /* GFX6 doesn't support V_RNDNE_F64, lower it. */
2025 Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2026 bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2027
2028 Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u)));
2029 Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi));
2030 Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
2031 Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
2032 static_cast<VOP3A_instruction*>(sub)->neg[1] = true;
2033 tmp = sub->definitions[0].getTemp();
2034
2035 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu));
2036 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
2037 static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2038 Temp cond = vop3->definitions[0].getTemp();
2039
2040 Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2041 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2042 Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond);
2043 Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond);
2044
2045 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2046 }
2047 } else {
2048 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2049 }
2050 break;
2051 }
2052 case nir_op_fsin:
2053 case nir_op_fcos: {
2054 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2055 aco_ptr<Instruction> norm;
2056 if (dst.regClass() == v2b) {
2057 Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
2058 Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
2059 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2060 bld.vop1(opcode, Definition(dst), tmp);
2061 } else if (dst.regClass() == v1) {
2062 Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
2063 Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
2064
2065 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2066 if (ctx->options->chip_class < GFX9)
2067 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
2068
2069 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2070 bld.vop1(opcode, Definition(dst), tmp);
2071 } else {
2072 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2073 }
2074 break;
2075 }
2076 case nir_op_ldexp: {
2077 Temp src0 = get_alu_src(ctx, instr->src[0]);
2078 Temp src1 = get_alu_src(ctx, instr->src[1]);
2079 if (dst.regClass() == v2b) {
2080 emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2081 } else if (dst.regClass() == v1) {
2082 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
2083 } else if (dst.regClass() == v2) {
2084 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
2085 } else {
2086 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2087 }
2088 break;
2089 }
2090 case nir_op_frexp_sig: {
2091 Temp src = get_alu_src(ctx, instr->src[0]);
2092 if (dst.regClass() == v2b) {
2093 bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
2094 } else if (dst.regClass() == v1) {
2095 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
2096 } else if (dst.regClass() == v2) {
2097 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
2098 } else {
2099 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2100 }
2101 break;
2102 }
2103 case nir_op_frexp_exp: {
2104 Temp src = get_alu_src(ctx, instr->src[0]);
2105 if (instr->src[0].src.ssa->bit_size == 16) {
2106 Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2107 tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u));
2108 convert_int(ctx, bld, tmp, 8, 32, true, dst);
2109 } else if (instr->src[0].src.ssa->bit_size == 32) {
2110 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
2111 } else if (instr->src[0].src.ssa->bit_size == 64) {
2112 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
2113 } else {
2114 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2115 }
2116 break;
2117 }
2118 case nir_op_fsign: {
2119 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2120 if (dst.regClass() == v2b) {
2121 Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
2122 Temp minus_one = bld.copy(bld.def(v1), Operand(0xbc00u));
2123 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2124 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
2125 cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2126 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
2127 } else if (dst.regClass() == v1) {
2128 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2129 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
2130 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2131 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
2132 } else if (dst.regClass() == v2) {
2133 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2134 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2135 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
2136
2137 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2138 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
2139 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2140
2141 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2142 } else {
2143 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2144 }
2145 break;
2146 }
2147 case nir_op_f2f16:
2148 case nir_op_f2f16_rtne: {
2149 Temp src = get_alu_src(ctx, instr->src[0]);
2150 if (instr->src[0].src.ssa->bit_size == 64)
2151 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2152 if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2153 /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2154 * keep value numbering and the scheduler simpler.
2155 */
2156 bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2157 else
2158 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2159 break;
2160 }
2161 case nir_op_f2f16_rtz: {
2162 Temp src = get_alu_src(ctx, instr->src[0]);
2163 if (instr->src[0].src.ssa->bit_size == 64)
2164 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2165 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
2166 break;
2167 }
2168 case nir_op_f2f32: {
2169 if (instr->src[0].src.ssa->bit_size == 16) {
2170 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2171 } else if (instr->src[0].src.ssa->bit_size == 64) {
2172 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2173 } else {
2174 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2175 }
2176 break;
2177 }
2178 case nir_op_f2f64: {
2179 Temp src = get_alu_src(ctx, instr->src[0]);
2180 if (instr->src[0].src.ssa->bit_size == 16)
2181 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2182 bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2183 break;
2184 }
2185 case nir_op_i2f16: {
2186 assert(dst.regClass() == v2b);
2187 Temp src = get_alu_src(ctx, instr->src[0]);
2188 if (instr->src[0].src.ssa->bit_size == 8)
2189 src = convert_int(ctx, bld, src, 8, 16, true);
2190 else if (instr->src[0].src.ssa->bit_size == 64)
2191 src = convert_int(ctx, bld, src, 64, 32, false);
2192 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2193 break;
2194 }
2195 case nir_op_i2f32: {
2196 assert(dst.size() == 1);
2197 Temp src = get_alu_src(ctx, instr->src[0]);
2198 if (instr->src[0].src.ssa->bit_size <= 16)
2199 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2200 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2201 break;
2202 }
2203 case nir_op_i2f64: {
2204 if (instr->src[0].src.ssa->bit_size <= 32) {
2205 Temp src = get_alu_src(ctx, instr->src[0]);
2206 if (instr->src[0].src.ssa->bit_size <= 16)
2207 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2208 bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2209 } else if (instr->src[0].src.ssa->bit_size == 64) {
2210 Temp src = get_alu_src(ctx, instr->src[0]);
2211 RegClass rc = RegClass(src.type(), 1);
2212 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2213 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2214 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2215 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2216 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2217 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2218
2219 } else {
2220 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2221 }
2222 break;
2223 }
2224 case nir_op_u2f16: {
2225 assert(dst.regClass() == v2b);
2226 Temp src = get_alu_src(ctx, instr->src[0]);
2227 if (instr->src[0].src.ssa->bit_size == 8)
2228 src = convert_int(ctx, bld, src, 8, 16, false);
2229 else if (instr->src[0].src.ssa->bit_size == 64)
2230 src = convert_int(ctx, bld, src, 64, 32, false);
2231 bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2232 break;
2233 }
2234 case nir_op_u2f32: {
2235 assert(dst.size() == 1);
2236 Temp src = get_alu_src(ctx, instr->src[0]);
2237 if (instr->src[0].src.ssa->bit_size == 8) {
2238 bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
2239 } else {
2240 if (instr->src[0].src.ssa->bit_size == 16)
2241 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2242 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
2243 }
2244 break;
2245 }
2246 case nir_op_u2f64: {
2247 if (instr->src[0].src.ssa->bit_size <= 32) {
2248 Temp src = get_alu_src(ctx, instr->src[0]);
2249 if (instr->src[0].src.ssa->bit_size <= 16)
2250 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2251 bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
2252 } else if (instr->src[0].src.ssa->bit_size == 64) {
2253 Temp src = get_alu_src(ctx, instr->src[0]);
2254 RegClass rc = RegClass(src.type(), 1);
2255 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2256 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2257 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2258 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2259 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2260 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2261 } else {
2262 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2263 }
2264 break;
2265 }
2266 case nir_op_f2i8:
2267 case nir_op_f2i16: {
2268 if (instr->src[0].src.ssa->bit_size == 16)
2269 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
2270 else if (instr->src[0].src.ssa->bit_size == 32)
2271 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2272 else
2273 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2274 break;
2275 }
2276 case nir_op_f2u8:
2277 case nir_op_f2u16: {
2278 if (instr->src[0].src.ssa->bit_size == 16)
2279 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
2280 else if (instr->src[0].src.ssa->bit_size == 32)
2281 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2282 else
2283 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2284 break;
2285 }
2286 case nir_op_f2i32: {
2287 Temp src = get_alu_src(ctx, instr->src[0]);
2288 if (instr->src[0].src.ssa->bit_size == 16) {
2289 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2290 if (dst.type() == RegType::vgpr) {
2291 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
2292 } else {
2293 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2294 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
2295 }
2296 } else if (instr->src[0].src.ssa->bit_size == 32) {
2297 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2298 } else if (instr->src[0].src.ssa->bit_size == 64) {
2299 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2300 } else {
2301 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2302 }
2303 break;
2304 }
2305 case nir_op_f2u32: {
2306 Temp src = get_alu_src(ctx, instr->src[0]);
2307 if (instr->src[0].src.ssa->bit_size == 16) {
2308 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2309 if (dst.type() == RegType::vgpr) {
2310 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
2311 } else {
2312 bld.pseudo(