/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <array>
#include <utility>

#include "ac_shader_util.h"
#include "aco_builder.h"
#include "aco_interface.h"
#include "aco_instruction_selection_setup.cpp"
#include "util/fast_idiv_by_const.h"

#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
static void _isel_err(isel_context *ctx, const char *file, unsigned line,
                      const nir_instr *instr, const char *msg)
{
   char *out;
   size_t outsize;
   FILE *memf = open_memstream(&out, &outsize);

   fprintf(memf, "%s: ", msg);
   nir_print_instr(instr, memf);
   fclose(memf);

   _aco_err(ctx->program, file, line, out);
   free(out);
}
class loop_info_RAII {
   isel_context* ctx;
   unsigned header_idx_old;
   Block* exit_old;
   bool divergent_cont_old;
   bool divergent_branch_old;
   bool divergent_if_old;

public:
   loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
      : ctx(ctx),
        header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
        divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
        divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
        divergent_if_old(ctx->cf_info.parent_if.is_divergent)
   {
      ctx->cf_info.parent_loop.header_idx = loop_header_idx;
      ctx->cf_info.parent_loop.exit = loop_exit;
      ctx->cf_info.parent_loop.has_divergent_continue = false;
      ctx->cf_info.parent_loop.has_divergent_branch = false;
      ctx->cf_info.parent_if.is_divergent = false;
      ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
   }

   ~loop_info_RAII()
   {
      ctx->cf_info.parent_loop.header_idx = header_idx_old;
      ctx->cf_info.parent_loop.exit = exit_old;
      ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
      ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
      ctx->cf_info.parent_if.is_divergent = divergent_if_old;
      ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
      if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
         ctx->cf_info.exec_potentially_empty_discard = false;
   }
};
struct if_context {
   Temp cond;

   bool divergent_old;
   bool exec_potentially_empty_discard_old;
   bool exec_potentially_empty_break_old;
   uint16_t exec_potentially_empty_break_depth_old;

   unsigned BB_if_idx;
   unsigned invert_idx;
   bool uniform_has_then_branch;
   bool then_branch_divergent;
   Block BB_invert;
   Block BB_endif;
};
static bool visit_cf_list(struct isel_context *ctx,
                          struct exec_list *list);
static void add_logical_edge(unsigned pred_idx, Block *succ)
{
   succ->logical_preds.emplace_back(pred_idx);
}

static void add_linear_edge(unsigned pred_idx, Block *succ)
{
   succ->linear_preds.emplace_back(pred_idx);
}

static void add_edge(unsigned pred_idx, Block *succ)
{
   add_logical_edge(pred_idx, succ);
   add_linear_edge(pred_idx, succ);
}

static void append_logical_start(Block *b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
}

static void append_logical_end(Block *b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
}
Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
{
   assert(ctx->allocated[def->index].id());
   return ctx->allocated[def->index];
}
Temp emit_mbcnt(isel_context *ctx, Definition dst,
                Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
{
   Builder bld(ctx->program, ctx->block);
   Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
   Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));

   if (ctx->program->wave_size == 32) {
      return thread_id_lo;
   } else if (ctx->program->chip_class <= GFX7) {
      Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
      return thread_id_hi;
   } else {
      Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo);
      return thread_id_hi;
   }
}
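/* v_mbcnt_lo/hi count how many bits of the given mask are set below the
 * current lane, so with the default all-ones masks the result is simply the
 * lane index. Worked example (wave64, mask = ~0ull): lane 37 gets
 * mbcnt_lo = 32 (all of bits 0..31), and mbcnt_hi adds bits 32..36, i.e. 5,
 * for a total of 37. */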
Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
{
   Builder bld(ctx->program, ctx->block);

   if (!dst.id())
      dst = bld.tmp(src.regClass());

   assert(src.size() == dst.size());

   if (ctx->stage != fragment_fs) {
      bld.copy(Definition(dst), src);
      return dst;
   }

   bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
   ctx->program->needs_wqm |= program_needs_wqm;
   return dst;
}
static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
{
   if (index.regClass() == s1)
      return bld.readlane(bld.def(s1), data, index);

   if (ctx->options->chip_class <= GFX7) {
      /* GFX6-7: there is no bpermute instruction */
      Operand index_op(index);
      Operand input_data(data);
      index_op.setLateKill(true);
      input_data.setLateKill(true);

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
   } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
      /* GFX10 wave64 mode: emulate full-wave bpermute */
      if (!ctx->has_gfx10_wave64_bpermute) {
         ctx->has_gfx10_wave64_bpermute = true;
         ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
         ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
      }

      Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
      Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
      Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
      Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
      Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
      Operand input_data(data);

      index_x4.setLateKill(true);
      input_data.setLateKill(true);
      same_half.setLateKill(true);

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
   } else {
      /* GFX8-9 or GFX10 wave32: bpermute works normally */
      Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
      return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
   }
}
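/* The index passed to ds_bpermute_b32 is a byte address (lane * 4), which is
 * why it is shifted left by 2 above. On GFX10 in wave64, LDS-based permutes
 * only see the 32 lanes of the executing half-wave, so lanes that need data
 * from the other half go through the shared-VGPR p_bpermute lowering;
 * same_half marks the lanes whose source lies in their own half. */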
static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask)
{
   if (ctx->options->chip_class >= GFX8) {
      unsigned and_mask = mask & 0x1f;
      unsigned or_mask = (mask >> 5) & 0x1f;
      unsigned xor_mask = (mask >> 10) & 0x1f;

      uint16_t dpp_ctrl = 0xffff;

      // TODO: we could use DPP8 for some swizzles
      if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
         unsigned res[4] = {0, 1, 2, 3};
         for (unsigned i = 0; i < 4; i++)
            res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
         dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
         dpp_ctrl = dpp_row_rr(8);
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
         dpp_ctrl = dpp_row_mirror;
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
         dpp_ctrl = dpp_row_half_mirror;
      }

      if (dpp_ctrl != 0xffff)
         return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
   }

   return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
}
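/* The ds_swizzle_b32 mask packs three 5-bit fields: bits 0-4 AND, bits 5-9 OR
 * and bits 10-14 XOR, applied to the lane id as ((lane & and) | or) ^ xor.
 * Worked example: mask = 0x041f (and = 0x1f, or = 0, xor = 1) swaps adjacent
 * lanes, and the code above recognizes exactly this pattern as
 * dpp_quad_perm(1, 0, 3, 2) on GFX8+. */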
Temp as_vgpr(isel_context *ctx, Temp val)
{
   if (val.type() == RegType::sgpr) {
      Builder bld(ctx->program, ctx->block);
      return bld.copy(bld.def(RegType::vgpr, val.size()), val);
   }
   assert(val.type() == RegType::vgpr);
   return val;
}
//assumes a != 0xffffffff
void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
{
   Builder bld(ctx->program, ctx->block);

   if (util_is_power_of_two_or_zero(b)) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
      return;
   }

   util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);

   assert(info.multiplier <= 0xffffffff);

   bool pre_shift = info.pre_shift != 0;
   bool increment = info.increment != 0;
   bool multiply = true;
   bool post_shift = info.post_shift != 0;

   if (!pre_shift && !increment && !multiply && !post_shift) {
      bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
      return;
   }

   Temp pre_shift_dst = a;
   if (pre_shift) {
      pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
   }

   Temp increment_dst = pre_shift_dst;
   if (increment) {
      increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
      bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
   }

   Temp multiply_dst = increment_dst;
   if (multiply) {
      multiply_dst = post_shift ? bld.tmp(v1) : dst;
      bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
               bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
   }

   if (post_shift) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
   }
}
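/* The stages above implement the usual invariant-division identity computed
 * by util_compute_fast_udiv_info, effectively:
 *
 *    dst = ((((a >> pre_shift) + increment) * multiplier) >> 32) >> post_shift
 *
 * with the high 32 bits of the product taken via v_mul_hi_u32. Stages whose
 * parameter is zero are skipped, and a power-of-two divisor reduces to a
 * single right shift. */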
void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
}
Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
{
   /* no need to extract the whole vector */
   if (src.regClass() == dst_rc) {
      assert(idx == 0);
      return src;
   }

   assert(src.bytes() > (idx * dst_rc.bytes()));
   Builder bld(ctx->program, ctx->block);
   auto it = ctx->allocated_vec.find(src.id());
   if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
      if (it->second[idx].regClass() == dst_rc) {
         return it->second[idx];
      } else {
         assert(!dst_rc.is_subdword());
         assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
         return bld.copy(bld.def(dst_rc), it->second[idx]);
      }
   }

   if (dst_rc.is_subdword())
      src = as_vgpr(ctx, src);

   if (src.bytes() == dst_rc.bytes()) {
      assert(idx == 0);
      return bld.copy(bld.def(dst_rc), src);
   } else {
      Temp dst = bld.tmp(dst_rc);
      emit_extract_vector(ctx, src, idx, dst);
      return dst;
   }
}
void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
{
   if (num_components == 1)
      return;
   if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
      return;
   RegClass rc;
   if (num_components > vec_src.size()) {
      if (vec_src.type() == RegType::sgpr) {
         /* should still help get_alu_src() */
         emit_split_vector(ctx, vec_src, vec_src.size());
         return;
      }
      /* sub-dword split */
      rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
   } else {
      rc = RegClass(vec_src.type(), vec_src.size() / num_components);
   }
   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
   split->operands[0] = Operand(vec_src);
   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
   for (unsigned i = 0; i < num_components; i++) {
      elems[i] = {ctx->program->allocateId(), rc};
      split->definitions[i] = Definition(elems[i]);
   }
   ctx->block->instructions.emplace_back(std::move(split));
   ctx->allocated_vec.emplace(vec_src.id(), elems);
}
/* This vector expansion uses a mask to determine which elements in the new vector
 * come from the original vector. The other elements are undefined. */
void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
{
   emit_split_vector(ctx, vec_src, util_bitcount(mask));

   if (vec_src == dst)
      return;

   Builder bld(ctx->program, ctx->block);
   if (num_components == 1) {
      if (dst.type() == RegType::sgpr)
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
      else
         bld.copy(Definition(dst), vec_src);
      return;
   }

   unsigned component_size = dst.size() / num_components;
   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;

   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
   vec->definitions[0] = Definition(dst);
   unsigned k = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (mask & (1 << i)) {
         Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
         if (dst.type() == RegType::sgpr)
            src = bld.as_uniform(src);
         vec->operands[i] = Operand(src);
      } else {
         vec->operands[i] = Operand(0u);
      }
      elems[i] = vec->operands[i].getTemp();
   }
   ctx->block->instructions.emplace_back(std::move(vec));
   ctx->allocated_vec.emplace(dst.id(), elems);
}
/* adjust misaligned small bit size loads */
void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Operand shift;
   Temp select = Temp();
   if (offset.isConstant()) {
      assert(offset.constantValue() && offset.constantValue() < 4);
      shift = Operand(offset.constantValue() * 8);
   } else {
      /* bit_offset = 8 * (offset & 0x3) */
      Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
      select = bld.tmp(s1);
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
   }

   if (vec.size() == 1) {
      bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
   } else if (vec.size() == 2) {
      Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
      bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
      if (tmp == dst)
         emit_split_vector(ctx, dst, 2);
      else
         emit_extract_vector(ctx, tmp, 0, dst);
   } else if (vec.size() == 4) {
      Temp lo = bld.tmp(s2), hi = bld.tmp(s2);
      bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
      hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
      if (select != Temp())
         hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
      lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
      Temp mid = bld.tmp(s1);
      lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
      hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
      mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
      emit_split_vector(ctx, dst, 2);
   }
}
void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
{
   Builder bld(ctx->program, ctx->block);
   if (offset.isTemp()) {
      Temp tmp[4] = {vec, vec, vec, vec};

      if (vec.size() == 4) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec);
      } else if (vec.size() == 3) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
      } else if (vec.size() == 2) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
      }

      for (unsigned i = 0; i < dst.size(); i++)
         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);

      vec = tmp[0];
      if (dst.size() == 2)
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);

      offset = Operand(0u);
   }

   unsigned num_components = vec.bytes() / component_size;
   if (vec.regClass() == dst.regClass()) {
      assert(offset.constantValue() == 0);
      bld.copy(Definition(dst), vec);
      emit_split_vector(ctx, dst, num_components);
      return;
   }

   emit_split_vector(ctx, vec, num_components);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();

   assert(offset.constantValue() % component_size == 0);
   unsigned skip = offset.constantValue() / component_size;
   for (unsigned i = skip; i < num_components; i++)
      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);

   /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
   if (dst.type() == RegType::vgpr) {
      num_components = dst.bytes() / component_size;
      aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
      for (unsigned i = 0; i < num_components; i++)
         create_vec->operands[i] = Operand(elems[i]);
      create_vec->definitions[0] = Definition(dst);
      bld.insert(std::move(create_vec));

   /* if dst is sgpr - split the src, but move the original to sgpr. */
   } else if (skip) {
      vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
      byte_align_scalar(ctx, vec, offset, dst);
   } else {
      assert(dst.size() == vec.size());
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
   }

   ctx->allocated_vec.emplace(dst.id(), elems);
}
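/* ACO represents NIR 1-bit booleans as wave lane masks (bld.lm: s1 in wave32,
 * s2 in wave64), while scalar control flow produces a single bit in SCC. The
 * two helpers below convert between these forms: s_cselect broadcasts SCC to
 * an all-ones/all-zeros mask, and ANDing a mask with exec leaves
 * "true in any active lane" in SCC. */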
Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(bld.lm);

   assert(val.regClass() == s1);
   assert(dst.regClass() == bld.lm);

   return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
}
Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(s1);

   assert(val.regClass() == bld.lm);
   assert(dst.regClass() == s1);

   /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
   Temp tmp = bld.tmp(s1);
   bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
   return emit_wqm(ctx, tmp, dst);
}
Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp())
{
   if (!dst.id()) {
      if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
         dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
      else
         dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
   }

   if (dst.bytes() == src.bytes() && dst_bits < src_bits)
      return bld.copy(Definition(dst), src);
   else if (dst.bytes() < src.bytes())
      return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));

   Temp tmp = dst;
   if (dst_bits == 64)
      tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);

   if (tmp == src) {
   } else if (src.regClass() == s1) {
      if (is_signed)
         bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
      else
         bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
   } else if (ctx->options->chip_class >= GFX8) {
      assert(src_bits != 8 || src.regClass() == v1b);
      assert(src_bits != 16 || src.regClass() == v2b);
      aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
      sdwa->operands[0] = Operand(src);
      sdwa->definitions[0] = Definition(tmp);
      if (is_signed)
         sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
      else
         sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
      sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
      bld.insert(std::move(sdwa));
   } else {
      assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
      aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
      bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
   }

   if (dst_bits == 64) {
      if (is_signed && dst.regClass() == s2) {
         Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else if (is_signed && dst.regClass() == v2) {
         Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else {
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
      }
   }

   return dst;
}
enum sgpr_extract_mode {
   sgpr_extract_sext,
   sgpr_extract_zext,
   sgpr_extract_undef,
};
Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src, sgpr_extract_mode mode)
{
   Temp vec = get_ssa_temp(ctx, src->src.ssa);
   unsigned src_size = src->src.ssa->bit_size;
   unsigned swizzle = src->swizzle[0];

   if (vec.size() > 1) {
      assert(src_size == 16);
      vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
      swizzle = swizzle & 1;
   }

   Builder bld(ctx->program, ctx->block);
   unsigned offset = src_size * swizzle;
   Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;

   if (mode == sgpr_extract_undef && swizzle == 0) {
      bld.copy(Definition(tmp), vec);
   } else if (mode == sgpr_extract_undef || (offset == 24 && mode == sgpr_extract_zext)) {
      bld.sop2(aco_opcode::s_lshr_b32, Definition(tmp), bld.def(s1, scc), vec, Operand(offset));
   } else if (src_size == 8 && swizzle == 0 && mode == sgpr_extract_sext) {
      bld.sop1(aco_opcode::s_sext_i32_i8, Definition(tmp), vec);
   } else if (src_size == 16 && swizzle == 0 && mode == sgpr_extract_sext) {
      bld.sop1(aco_opcode::s_sext_i32_i16, Definition(tmp), vec);
   } else {
      aco_opcode op = mode == sgpr_extract_zext ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
      bld.sop2(op, Definition(tmp), bld.def(s1, scc), vec, Operand((src_size << 16) | offset));
   }

   if (dst.regClass() == s2)
      convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);

   return dst;
}
Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
{
   if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
      return get_ssa_temp(ctx, src.src.ssa);

   if (src.src.ssa->num_components == size) {
      bool identity_swizzle = true;
      for (unsigned i = 0; identity_swizzle && i < size; i++) {
         if (src.swizzle[i] != i)
            identity_swizzle = false;
      }
      if (identity_swizzle)
         return get_ssa_temp(ctx, src.src.ssa);
   }

   Temp vec = get_ssa_temp(ctx, src.src.ssa);
   unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
   assert(elem_size > 0);
   assert(vec.bytes() % elem_size == 0);

   if (elem_size < 4 && vec.type() == RegType::sgpr) {
      assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
      assert(size == 1);
      return extract_8_16_bit_sgpr_element(
         ctx, Temp(ctx->program->allocateId(), s1), &src, sgpr_extract_undef);
   }

   RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
   if (size == 1) {
      return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
   } else {
      assert(size <= 4);
      std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
      aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
      for (unsigned i = 0; i < size; ++i) {
         elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
         vec_instr->operands[i] = Operand{elems[i]};
      }
      Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)};
      vec_instr->definitions[0] = Definition(dst);
      ctx->block->instructions.emplace_back(std::move(vec_instr));
      ctx->allocated_vec.emplace(dst.id(), elems);
      return dst;
   }
}
Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
{
   if (ptr.size() == 2)
      return ptr;
   Builder bld(ctx->program, ctx->block);
   if (ptr.type() == RegType::vgpr)
      ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
                     ptr, Operand((unsigned)ctx->options->address32_hi));
}
void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
{
   aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
   sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
   sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
   sop2->definitions[0] = Definition(dst);
   if (instr->no_unsigned_wrap)
      sop2->definitions[0].setNUW(true);
   if (writes_scc)
      sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
   ctx->block->instructions.emplace_back(std::move(sop2));
}
void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
                           bool commutative, bool swap_srcs=false, bool flush_denorms = false)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
   Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
   if (src1.type() == RegType::sgpr) {
      if (commutative && src0.type() == RegType::vgpr) {
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   if (flush_denorms && ctx->program->chip_class < GFX9) {
      assert(dst.size() == 1);
      Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
      bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
   } else {
      bld.vop2(op, Definition(dst), src0, src1);
   }
}
void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
                                   aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   if (src1.type() == RegType::sgpr) {
      assert(src0.type() == RegType::vgpr);
      std::swap(src0, src1);
   }

   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(src0.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(v1);
   Temp src11 = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
   Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
   Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
}
void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
                            bool flush_denorms = false)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   Temp src2 = get_alu_src(ctx, instr->src[2]);

   /* ensure that the instruction has at most 1 sgpr operand
    * The optimizer will inline constants for us */
   if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
      src0 = as_vgpr(ctx, src0);
   if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
      src1 = as_vgpr(ctx, src1);
   if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
      src2 = as_vgpr(ctx, src2);

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (flush_denorms && ctx->program->chip_class < GFX9) {
      assert(dst.size() == 1);
      Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
      bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
   } else {
      bld.vop3(op, Definition(dst), src0, src1, src2);
   }
}
void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (dst.type() == RegType::sgpr)
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
   else
      bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
}
void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   assert(src0.size() == src1.size());

   aco_ptr<Instruction> vopc;
   if (src1.type() == RegType::sgpr) {
      if (src0.type() == RegType::vgpr) {
         /* to swap the operands, we might also have to change the opcode */
         switch (op) {
         case aco_opcode::v_cmp_lt_f16:
            op = aco_opcode::v_cmp_gt_f16;
            break;
         case aco_opcode::v_cmp_ge_f16:
            op = aco_opcode::v_cmp_le_f16;
            break;
         case aco_opcode::v_cmp_lt_i16:
            op = aco_opcode::v_cmp_gt_i16;
            break;
         case aco_opcode::v_cmp_ge_i16:
            op = aco_opcode::v_cmp_le_i16;
            break;
         case aco_opcode::v_cmp_lt_u16:
            op = aco_opcode::v_cmp_gt_u16;
            break;
         case aco_opcode::v_cmp_ge_u16:
            op = aco_opcode::v_cmp_le_u16;
            break;
         case aco_opcode::v_cmp_lt_f32:
            op = aco_opcode::v_cmp_gt_f32;
            break;
         case aco_opcode::v_cmp_ge_f32:
            op = aco_opcode::v_cmp_le_f32;
            break;
         case aco_opcode::v_cmp_lt_i32:
            op = aco_opcode::v_cmp_gt_i32;
            break;
         case aco_opcode::v_cmp_ge_i32:
            op = aco_opcode::v_cmp_le_i32;
            break;
         case aco_opcode::v_cmp_lt_u32:
            op = aco_opcode::v_cmp_gt_u32;
            break;
         case aco_opcode::v_cmp_ge_u32:
            op = aco_opcode::v_cmp_le_u32;
            break;
         case aco_opcode::v_cmp_lt_f64:
            op = aco_opcode::v_cmp_gt_f64;
            break;
         case aco_opcode::v_cmp_ge_f64:
            op = aco_opcode::v_cmp_le_f64;
            break;
         case aco_opcode::v_cmp_lt_i64:
            op = aco_opcode::v_cmp_gt_i64;
            break;
         case aco_opcode::v_cmp_ge_i64:
            op = aco_opcode::v_cmp_le_i64;
            break;
         case aco_opcode::v_cmp_lt_u64:
            op = aco_opcode::v_cmp_gt_u64;
            break;
         case aco_opcode::v_cmp_ge_u64:
            op = aco_opcode::v_cmp_le_u64;
            break;
         default: /* eq and ne are commutative */
            break;
         }
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Builder bld(ctx->program, ctx->block);
   bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
}
void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   Builder bld(ctx->program, ctx->block);

   assert(dst.regClass() == bld.lm);
   assert(src0.type() == RegType::sgpr);
   assert(src1.type() == RegType::sgpr);
   assert(src0.regClass() == src1.regClass());

   /* Emit the SALU comparison instruction */
   Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
   /* Turn the result into a per-lane bool */
   bool_to_vector_condition(ctx, cmp, dst);
}
void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
                     aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
{
   aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
   aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
   bool use_valu = s_op == aco_opcode::num_opcodes ||
                   nir_dest_is_divergent(instr->dest.dest) ||
                   ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
                   ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
   aco_opcode op = use_valu ? v_op : s_op;
   assert(op != aco_opcode::num_opcodes);
   assert(dst.regClass() == ctx->program->lane_mask);

   if (use_valu)
      emit_vopc_instruction(ctx, instr, op, dst);
   else
      emit_sopc_instruction(ctx, instr, op, dst);
}
void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   assert(dst.regClass() == bld.lm);
   assert(src0.regClass() == bld.lm);
   assert(src1.regClass() == bld.lm);

   bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
}
void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp cond = get_alu_src(ctx, instr->src[0]);
   Temp then = get_alu_src(ctx, instr->src[1]);
   Temp els = get_alu_src(ctx, instr->src[2]);

   assert(cond.regClass() == bld.lm);

   if (dst.type() == RegType::vgpr) {
      aco_ptr<Instruction> bcsel;
      if (dst.size() == 1) {
         then = as_vgpr(ctx, then);
         els = as_vgpr(ctx, els);

         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
      } else if (dst.size() == 2) {
         Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
         Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);

         Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
         Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      return;
   }

   if (instr->dest.dest.ssa.bit_size == 1) {
      assert(dst.regClass() == bld.lm);
      assert(then.regClass() == bld.lm);
      assert(els.regClass() == bld.lm);
   }

   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
      if (dst.regClass() == s1 || dst.regClass() == s2) {
         assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
         assert(dst.size() == then.size());
         aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
         bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
      } else {
         isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
      }
      return;
   }

   /* divergent boolean bcsel
    * this implements bcsel on bools: dst = s0 ? s1 : s2,
    * computed as: dst = (s0 & s1) | (~s0 & s2) */
   assert(instr->dest.dest.ssa.bit_size == 1);

   if (cond.id() != then.id())
      then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);

   if (cond.id() == els.id())
      bld.sop1(Builder::s_mov, Definition(dst), then);
   else
      bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
               bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
}
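/* For reference, the lane-mask algebra above: when cond and then are the same
 * temp, cond & then == cond and the AND is skipped; when cond and els are the
 * same temp, ~cond & els == 0 and the whole select collapses to a move. */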
void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
                    aco_opcode op, uint32_t undo)
{
   /* multiply by 16777216 to handle denormals */
   Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
                               as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
   Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
   scaled = bld.vop1(op, bld.def(v1), scaled);
   scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);

   Temp not_scaled = bld.vop1(op, bld.def(v1), val);

   bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
}
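/* 0x4b800000 is 16777216.0f (2^24): scaling a denormal input by it yields a
 * normal number the hardware transcendentals accept, and `undo` compensates
 * on the way out. E.g. rcp of the scaled input is 2^24 too small, so its undo
 * is 2^24 again; sqrt comes out 2^12 too large (undo 0x39800000 = 2^-12) and
 * rsq 2^12 too small (undo 0x45800000 = 2^12). The v_cmp_class mask
 * (1u << 4 | 1u << 7) tests for negative and positive denormals. */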
void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rcp_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
}

void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rsq_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
}

void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
}

void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_log_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
}
Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);

   /* GFX6 doesn't support V_TRUNC_F64, lower it. */
   /* TODO: create more efficient code! */
   if (val.type() == RegType::sgpr)
      val = as_vgpr(ctx, val);

   /* Split the input value. */
   Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);

   /* Extract the exponent and compute the unbiased value. */
   Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
   exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));

   /* Extract the fractional part. */
   Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
   fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);

   Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);

   Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
   Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
   fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
   tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
   fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);

   /* Get the sign bit. */
   Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);

   /* Decide the operation to apply depending on the unbiased exponent. */
   Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
   Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
   Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
   Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
   dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
   dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);

   return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
}
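/* The lowering above truncates by clearing the mantissa bits that encode the
 * fractional part: for unbiased exponent e, the low 52 - e mantissa bits are
 * fractional, which is what the right-shifted fract_mask selects. e < 0 means
 * |val| < 1, so the result is +/-0 with the sign preserved; e > 51 means val
 * is already integral and is returned unchanged. */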
Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);

   /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
    * lowered at NIR level for precision reasons). */
   Temp src0 = as_vgpr(ctx, val);

   Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
   Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));

   Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
   Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
   Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);

   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);

   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);

   Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);

   Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
   static_cast<VOP3A_instruction*>(add)->neg[1] = true;

   return add->definitions[0].getTemp();
}
void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
{
   if (!instr->dest.dest.is_ssa) {
      isel_err(&instr->instr, "nir alu dst not in ssa");
      abort();
   }
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
   switch (instr->op) {
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4: {
      std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
      unsigned num = instr->dest.dest.ssa.num_components;
      for (unsigned i = 0; i < num; ++i)
         elems[i] = get_alu_src(ctx, instr->src[i]);

      if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
         RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
         for (unsigned i = 0; i < num; ++i) {
            if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
               vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
            else
               vec->operands[i] = Operand{elems[i]};
         }
         vec->definitions[0] = Definition(dst);
         ctx->block->instructions.emplace_back(std::move(vec));
         ctx->allocated_vec.emplace(dst.id(), elems);
      } else {
         // TODO: that is a bit suboptimal..
         Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1));
         for (unsigned i = 0; i < num - 1; ++i)
            if (((i+1) * instr->dest.dest.ssa.bit_size) % 32)
               elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
         for (unsigned i = 0; i < num; ++i) {
            unsigned bit = i * instr->dest.dest.ssa.bit_size;
            if (bit % 32 == 0) {
               elems[bit / 32] = elems[i];
            } else {
               elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
                                   elems[i], Operand((i * instr->dest.dest.ssa.bit_size) % 32));
               elems[bit / 32] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[bit / 32], elems[i]);
            }
         }
         if (dst.size() == 1)
            bld.copy(Definition(dst), elems[0]);
         else
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), elems[0], elems[1]);
      }
      break;
   }
   case nir_op_mov: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      aco_ptr<Instruction> mov;
      if (dst.type() == RegType::sgpr) {
         if (src.type() == RegType::vgpr)
            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
         else if (src.regClass() == s1)
            bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
         else if (src.regClass() == s2)
            bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
         else
            unreachable("wrong src register class for nir_op_imov");
      } else {
         if (dst.regClass() == v1)
            bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
         else if (dst.regClass() == v1b ||
                  dst.regClass() == v2b ||
                  dst.regClass() == v2)
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
         else
            unreachable("wrong src register class for nir_op_imov");
      }
      break;
   }
   case nir_op_inot: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->dest.dest.ssa.bit_size == 1) {
         assert(src.regClass() == bld.lm);
         assert(dst.regClass() == bld.lm);
         /* Don't use s_andn2 here, this allows the optimizer to make a better decision */
         Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
         bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
      } else if (dst.regClass() == v2) {
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
         hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
      } else if (dst.type() == RegType::sgpr) {
         aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
         bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ineg: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v1) {
         bld.vsub32(Definition(dst), Operand(0u), Operand(src));
      } else if (dst.regClass() == s1) {
         bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
      } else if (dst.size() == 2) {
         Temp src0 = bld.tmp(dst.type(), 1);
         Temp src1 = bld.tmp(dst.type(), 1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);

         if (dst.regClass() == s2) {
            Temp carry = bld.tmp(s1);
            Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
            Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
         } else {
            Temp lower = bld.tmp(v1);
            Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
            Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
         }
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iabs: {
      if (dst.regClass() == s1) {
         bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v1) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_isign: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == s1) {
         Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1));
         bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u));
      } else if (dst.regClass() == s2) {
         Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
         Temp neqz;
         if (ctx->program->chip_class >= GFX8)
            neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
         else
            neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
         /* SCC gets zero-extended to 64 bit */
         bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
      } else if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand((uint32_t)-1), src, Operand(1u));
      } else if (dst.regClass() == v2) {
         Temp upper = emit_extract_vector(ctx, src, 1, v1);
         Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
         Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imax: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umax: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imin: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umin: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ior: {
      if (instr->dest.dest.ssa.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_or, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iand: {
      if (instr->dest.dest.ssa.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_and, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ixor: {
      if (instr->dest.dest.ssa.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ushr: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
      } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
         bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
                  get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
                  get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ishl: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
      } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
         bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
                  get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
                  get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ishr: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
      } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
         bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
                  get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
                  get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_find_lsb: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1) {
         bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
      } else if (src.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
      } else if (src.regClass() == s2) {
         bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ufind_msb:
   case nir_op_ifind_msb: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1 || src.regClass() == s2) {
         aco_opcode op = src.regClass() == s2 ?
                         (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
                         (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
         Temp msb_rev = bld.sop1(op, bld.def(s1), src);

         Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
                                        Operand(src.size() * 32u - 1u), msb_rev);
         Temp msb = sub.def(0).getTemp();
         Temp carry = sub.def(1).getTemp();

         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
      } else if (src.regClass() == v1) {
         aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
         Temp msb_rev = bld.tmp(v1);
         emit_vop1_instruction(ctx, instr, op, msb_rev);
         Temp msb = bld.tmp(v1);
         Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_bitfield_reverse: {
      if (dst.regClass() == s1) {
         bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v1) {
         bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iadd: {
      if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
         break;
      }

      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == v1) {
         bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
         break;
      }

      assert(src0.size() == 2 && src1.size() == 2);
      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);

      if (dst.regClass() == s2) {
         Temp carry = bld.tmp(s1);
         Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
         Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else if (dst.regClass() == v2) {
         Temp dst0 = bld.tmp(v1);
         Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
         Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_uadd_sat: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
         bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
                  src0, src1);
         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
      } else if (dst.regClass() == v1) {
         if (ctx->options->chip_class >= GFX9) {
            aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
            add->operands[0] = Operand(src0);
            add->operands[1] = Operand(src1);
            add->definitions[0] = Definition(dst);
            add->clamp = 1;
            ctx->block->instructions.emplace_back(std::move(add));
         } else {
            if (src1.regClass() != v1)
               std::swap(src0, src1);
            assert(src1.regClass() == v1);
            Temp tmp = bld.tmp(v1);
            Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
            bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
         }
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_uadd_carry: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
         break;
      }
      if (dst.regClass() == v1) {
         Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
         break;
      }

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp carry = bld.tmp(s1);
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
         carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
      } else if (dst.regClass() == v2) {
         Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
         carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
         carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_isub: {
      if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
         break;
      }

      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == v1) {
         bld.vsub32(Definition(dst), src0, src1);
         break;
      }

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp carry = bld.tmp(s1);
         Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
         Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else if (dst.regClass() == v2) {
         Temp lower = bld.tmp(v1);
         Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
         Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_usub_borrow: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
         break;
      } else if (dst.regClass() == v1) {
         Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
         break;
      }

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp borrow = bld.tmp(s1);
         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
         borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
      } else if (dst.regClass() == v2) {
         Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
         borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
         borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imul: {
      if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
                  get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umul_high: {
      if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
         bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1) {
         Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
                             as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imul_high: {
      if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
         bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1) {
         Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
                             as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fmul: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
      if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fadd: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
      if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fsub: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == v2b) {
         if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
            emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
         else
            emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
      } else if (dst.regClass() == v1) {
         if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
            emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
         else
            emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
      } else if (dst.regClass() == v2) {
         Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
                                     as_vgpr(ctx, src0), as_vgpr(ctx, src1));
         VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
         sub->neg[1] = true;
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fmax: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
      if (dst.regClass() == v2b) {
         // TODO: check fp_mode.must_flush_denorms16_64
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
      } else if (dst.regClass() == v2) {
         if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
            Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1);
            bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
         } else {
            bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
         }
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fmin: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
      if (dst.regClass() == v2b) {
         // TODO: check fp_mode.must_flush_denorms16_64
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
      } else if (dst.regClass() == v2) {
         if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
            Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1);
            bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
         } else {
            bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
         }
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_cube_face_coord: {
      Temp in = get_alu_src(ctx, instr->src[0], 3);
      Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
                      emit_extract_vector(ctx, in, 1, v1),
                      emit_extract_vector(ctx, in, 2, v1) };
      Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
      ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
      Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
      Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
      sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1),
                    bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma), Operand(0x3f000000u/*0.5*/));
      tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1),
                    bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma), Operand(0x3f000000u/*0.5*/));
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
      break;
   }
   case nir_op_cube_face_index: {
      Temp in = get_alu_src(ctx, instr->src[0], 3);
      Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
                      emit_extract_vector(ctx, in, 1, v1),
                      emit_extract_vector(ctx, in, 2, v1) };
      bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
      break;
   }
   case nir_op_bcsel: {
      emit_bcsel(ctx, instr, dst);
      break;
   }
   case nir_op_frsq: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_rsq(ctx, bld, Definition(dst), src);
      } else if (dst.regClass() == v2) {
         /* Lowered at NIR level for precision reasons. */
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fneg: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         if (ctx->block->fp_mode.must_flush_denorms16_64)
            src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
         bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
      } else if (dst.regClass() == v1) {
         if (ctx->block->fp_mode.must_flush_denorms32)
            src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
         bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
      } else if (dst.regClass() == v2) {
         if (ctx->block->fp_mode.must_flush_denorms16_64)
            src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
         Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
         upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fabs: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         if (ctx->block->fp_mode.must_flush_denorms16_64)
            src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
         bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
      } else if (dst.regClass() == v1) {
         if (ctx->block->fp_mode.must_flush_denorms32)
            src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
         bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
      } else if (dst.regClass() == v2) {
         if (ctx->block->fp_mode.must_flush_denorms16_64)
            src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
         Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
         upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fsat: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
      } else if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
         /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
         // TODO: confirm that this holds under any circumstances
      } else if (dst.regClass() == v2) {
         Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
         VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
         vop3->clamp = true;
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_flog2: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_log2(ctx, bld, Definition(dst), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_frcp: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_rcp(ctx, bld, Definition(dst), src);
      } else if (dst.regClass() == v2) {
         /* Lowered at NIR level for precision reasons. */
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fexp2: {
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fsqrt: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_sqrt(ctx, bld, Definition(dst), src);
      } else if (dst.regClass() == v2) {
         /* Lowered at NIR level for precision reasons. */
         emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ffract: {
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
      } else if (dst.regClass() == v2) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ffloor: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
      } else if (dst.regClass() == v2) {
         emit_floor_f64(ctx, bld, Definition(dst), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fceil: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
      } else if (dst.regClass() == v2) {
         if (ctx->options->chip_class >= GFX7) {
            emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
         } else {
            /* GFX6 doesn't support V_CEIL_F64, lower it. */
            /* trunc = trunc(src0)
             * if (src0 > 0.0 && src0 != trunc)
             *    trunc += 1.0
             */
            Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
            Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u));
            Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
            Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1);
            Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond);
            add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add);
            bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
         }
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ftrunc: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
      } else if (dst.regClass() == v2) {
         emit_trunc_f64(ctx, bld, Definition(dst), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fround_even: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
      } else if (dst.regClass() == v2) {
         if (ctx->options->chip_class >= GFX7) {
            emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
         } else {
            /* GFX6 doesn't support V_RNDNE_F64, lower it. */
            Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);

            Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u)));
            Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi));
            Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
            Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
            static_cast<VOP3A_instruction*>(sub)->neg[1] = true;
            tmp = sub->definitions[0].getTemp();

            Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu));
            Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
            static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
            Temp cond = vop3->definitions[0].getTemp();

            Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
            Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond);
            Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond);

            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
         }
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fsin:
   case nir_op_fcos: {
      Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
      aco_ptr<Instruction> norm;
      if (dst.regClass() == v2b) {
         Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
         Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
         aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
         bld.vop1(opcode, Definition(dst), tmp);
      } else if (dst.regClass() == v1) {
         Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
         Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);

         /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
         if (ctx->options->chip_class < GFX9)
            tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);

         aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
         bld.vop1(opcode, Definition(dst), tmp);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ldexp: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
      } else if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_frexp_sig: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
      } else if (dst.regClass() == v1) {
         bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
      } else if (dst.regClass() == v2) {
         bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_frexp_exp: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 16) {
         Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
         tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u));
         convert_int(ctx, bld, tmp, 8, 32, true, dst);
      } else if (instr->src[0].src.ssa->bit_size == 32) {
         bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fsign: {
      Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
      if (dst.regClass() == v2b) {
         Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
         Temp minus_one = bld.copy(bld.def(v1), Operand(0xbc00u));
         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
         cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
      } else if (dst.regClass() == v1) {
         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
         cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
      } else if (dst.regClass() == v2) {
         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
         Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);

         cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_f2f16:
   case nir_op_f2f16_rtne: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 64)
         src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
      if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
         /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
          * keep value numbering and the scheduler simpler.
          */
         bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
      else
         bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
      break;
   }
   case nir_op_f2f16_rtz: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 64)
         src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
      bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
      break;
   }
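   /* There are no direct f16<->f64 conversion instructions, so conversions
    * between half and double go through a 32-bit float intermediate. */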
   case nir_op_f2f32: {
      if (instr->src[0].src.ssa->bit_size == 16) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_f2f64: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 16)
         src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
      bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
      break;
   }
   case nir_op_i2f16: {
      assert(dst.regClass() == v2b);
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 8)
         src = convert_int(ctx, bld, src, 8, 16, true);
      else if (instr->src[0].src.ssa->bit_size == 64)
         src = convert_int(ctx, bld, src, 64, 32, false);
      bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
      break;
   }
   case nir_op_i2f32: {
      assert(dst.size() == 1);
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size <= 16)
         src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
      bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
      break;
   }
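   /* 64-bit integers are converted to double by converting the two 32-bit
    * halves separately, scaling the high half by 2^32 with v_ldexp_f64 and
    * adding the results; the low half is always treated as unsigned. */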
   case nir_op_i2f64: {
      if (instr->src[0].src.ssa->bit_size <= 32) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         if (instr->src[0].src.ssa->bit_size <= 16)
            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
         bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         RegClass rc = RegClass(src.type(), 1);
         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
         upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
         bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_u2f16: {
      assert(dst.regClass() == v2b);
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 8)
         src = convert_int(ctx, bld, src, 8, 16, false);
      else if (instr->src[0].src.ssa->bit_size == 64)
         src = convert_int(ctx, bld, src, 64, 32, false);
      bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
      break;
   }
   case nir_op_u2f32: {
      assert(dst.size() == 1);
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 8) {
         bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
      } else {
         if (instr->src[0].src.ssa->bit_size == 16)
            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
         bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
      }
      break;
   }
   case nir_op_u2f64: {
      if (instr->src[0].src.ssa->bit_size <= 32) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         if (instr->src[0].src.ssa->bit_size <= 16)
            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
         bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         RegClass rc = RegClass(src.type(), 1);
         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
         upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
         bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_f2i16: {
      if (instr->src[0].src.ssa->bit_size == 16)
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
      else if (instr->src[0].src.ssa->bit_size == 32)
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
      else
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
      break;
   }
   case nir_op_f2u16: {
      if (instr->src[0].src.ssa->bit_size == 16)
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
      else if (instr->src[0].src.ssa->bit_size == 32)
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
      else
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
      break;
   }
   case nir_op_f2i32: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 16) {
         Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
         if (dst.type() == RegType::vgpr) {
            bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
         } else {
            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                       bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
         }
      } else if (instr->src[0].src.ssa->bit_size == 32) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_f2u32: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 16) {
         Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
         if (dst.type() == RegType::vgpr) {
            bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
         } else {
            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                       bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));