/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "ac_shader_util.h"
#include "aco_builder.h"
#include "aco_interface.h"
#include "aco_instruction_selection_setup.cpp"
#include "util/fast_idiv_by_const.h"
class loop_info_RAII {
   isel_context* ctx;
   unsigned header_idx_old;
   Block* exit_old;
   bool divergent_cont_old;
   bool divergent_branch_old;
   bool divergent_if_old;

public:
   loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
      : ctx(ctx),
        header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
        divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
        divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
        divergent_if_old(ctx->cf_info.parent_if.is_divergent)
   {
      ctx->cf_info.parent_loop.header_idx = loop_header_idx;
      ctx->cf_info.parent_loop.exit = loop_exit;
      ctx->cf_info.parent_loop.has_divergent_continue = false;
      ctx->cf_info.parent_loop.has_divergent_branch = false;
      ctx->cf_info.parent_if.is_divergent = false;
      ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
   }

   ~loop_info_RAII()
   {
      ctx->cf_info.parent_loop.header_idx = header_idx_old;
      ctx->cf_info.parent_loop.exit = exit_old;
      ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
      ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
      ctx->cf_info.parent_if.is_divergent = divergent_if_old;
      ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
      if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
         ctx->cf_info.exec_potentially_empty_discard = false;
   }
};
struct if_context {
   Temp cond;

   bool divergent_old;
   bool exec_potentially_empty_discard_old;
   bool exec_potentially_empty_break_old;
   uint16_t exec_potentially_empty_break_depth_old;

   unsigned BB_if_idx;
   unsigned invert_idx;
   bool uniform_has_then_branch;
   bool then_branch_divergent;
   Block BB_invert;
   Block BB_endif;
};
static bool visit_cf_list(struct isel_context *ctx,
                          struct exec_list *list);
static void add_logical_edge(unsigned pred_idx, Block *succ)
{
   succ->logical_preds.emplace_back(pred_idx);
}

static void add_linear_edge(unsigned pred_idx, Block *succ)
{
   succ->linear_preds.emplace_back(pred_idx);
}

static void add_edge(unsigned pred_idx, Block *succ)
{
   add_logical_edge(pred_idx, succ);
   add_linear_edge(pred_idx, succ);
}

static void append_logical_start(Block *b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
}

static void append_logical_end(Block *b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
}
Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
{
   assert(ctx->allocated[def->index].id());
   return ctx->allocated[def->index];
}
Temp emit_mbcnt(isel_context *ctx, Definition dst,
                Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
{
   Builder bld(ctx->program, ctx->block);
   Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
   Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));

   if (ctx->program->wave_size == 32) {
      return thread_id_lo;
   } else if (ctx->program->chip_class <= GFX7) {
      Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
      return thread_id_hi;
   } else {
      Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo);
      return thread_id_hi;
   }
}
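/* Note: v_mbcnt_{lo,hi} gives each lane the number of set bits in the given
 * mask that lie below its own lane index, i.e. a per-lane prefix population
 * count; chaining the lo and hi halves extends this to the full 64-bit mask
 * on wave64. */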
Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
{
   Builder bld(ctx->program, ctx->block);

   if (!dst.id())
      dst = bld.tmp(src.regClass());

   assert(src.size() == dst.size());

   if (ctx->stage != fragment_fs) {
      if (!dst.id())
         return src;

      bld.copy(Definition(dst), src);
      return dst;
   }

   bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
   ctx->program->needs_wqm |= program_needs_wqm;
   return dst;
}
static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
{
   if (index.regClass() == s1)
      return bld.readlane(bld.def(s1), data, index);

   if (ctx->options->chip_class <= GFX7) {
      /* GFX6-7: there is no bpermute instruction */
      Operand index_op(index);
      Operand input_data(data);
      index_op.setLateKill(true);
      input_data.setLateKill(true);

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
   } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
      /* GFX10 wave64 mode: emulate full-wave bpermute */
      if (!ctx->has_gfx10_wave64_bpermute) {
         ctx->has_gfx10_wave64_bpermute = true;
         ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
         ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
      }

      Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
      Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
      Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
      Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
      Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
      Operand input_data(data);

      index_x4.setLateKill(true);
      input_data.setLateKill(true);
      same_half.setLateKill(true);

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
   } else {
      /* GFX8-9 or GFX10 wave32: bpermute works normally */
      Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
      return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
   }
}
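/* Note: ds_bpermute_b32 addresses lanes in bytes, which is why the lane index
 * is shifted left by 2 (index * 4) before use; the GFX10 wave64 emulation
 * path above keeps the same convention. */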
static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask)
{
   if (ctx->options->chip_class >= GFX8) {
      unsigned and_mask = mask & 0x1f;
      unsigned or_mask = (mask >> 5) & 0x1f;
      unsigned xor_mask = (mask >> 10) & 0x1f;

      uint16_t dpp_ctrl = 0xffff;

      // TODO: we could use DPP8 for some swizzles
      if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
         unsigned res[4] = {0, 1, 2, 3};
         for (unsigned i = 0; i < 4; i++)
            res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
         dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
         dpp_ctrl = dpp_row_rr(8);
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
         dpp_ctrl = dpp_row_mirror;
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
         dpp_ctrl = dpp_row_half_mirror;
      }

      if (dpp_ctrl != 0xffff)
         return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
   }

   return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
}
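/* Note: in the ds_swizzle_b32 bit-masks mode used here, the lane mapping is
 * new_lane = ((old_lane & and_mask) | or_mask) ^ xor_mask, with the three
 * 5-bit fields packed into the offset as and[4:0], or[9:5] and xor[14:10];
 * the DPP fast paths above only cover the patterns expressible as quad
 * permutes, row rotates and row (half-)mirrors. */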
Temp as_vgpr(isel_context *ctx, Temp val)
{
   if (val.type() == RegType::sgpr) {
      Builder bld(ctx->program, ctx->block);
      return bld.copy(bld.def(RegType::vgpr, val.size()), val);
   }
   assert(val.type() == RegType::vgpr);
   return val;
}
//assumes a != 0xffffffff
void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
{
   Builder bld(ctx->program, ctx->block);

   if (util_is_power_of_two_or_zero(b)) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
      return;
   }

   util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);

   assert(info.multiplier <= 0xffffffff);

   bool pre_shift = info.pre_shift != 0;
   bool increment = info.increment != 0;
   bool multiply = true;
   bool post_shift = info.post_shift != 0;

   if (!pre_shift && !increment && !multiply && !post_shift) {
      bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
      return;
   }

   Temp pre_shift_dst = a;
   if (pre_shift) {
      pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
   }

   Temp increment_dst = pre_shift_dst;
   if (increment) {
      increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
      bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
   }

   Temp multiply_dst = increment_dst;
   if (multiply) {
      multiply_dst = post_shift ? bld.tmp(v1) : dst;
      bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
               bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
   }

   if (post_shift)
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
}
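/* Note: util_compute_fast_udiv_info implements the usual division-by-constant
 * transform, a / b == (((a >> pre_shift) + increment) * multiplier) >> (32 + post_shift),
 * with v_mul_hi_u32 above supplying the high 32 bits of the product (the
 * implicit ">> 32"). */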
void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
}
Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
{
   /* no need to extract the whole vector */
   if (src.regClass() == dst_rc) {
      assert(idx == 0);
      return src;
   }

   assert(src.bytes() > (idx * dst_rc.bytes()));
   Builder bld(ctx->program, ctx->block);
   auto it = ctx->allocated_vec.find(src.id());
   if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
      if (it->second[idx].regClass() == dst_rc) {
         return it->second[idx];
      } else {
         assert(!dst_rc.is_subdword());
         assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
         return bld.copy(bld.def(dst_rc), it->second[idx]);
      }
   }

   if (dst_rc.is_subdword())
      src = as_vgpr(ctx, src);

   if (src.bytes() == dst_rc.bytes()) {
      assert(idx == 0);
      return bld.copy(bld.def(dst_rc), src);
   } else {
      Temp dst = bld.tmp(dst_rc);
      emit_extract_vector(ctx, src, idx, dst);
      return dst;
   }
}
void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
{
   if (num_components == 1)
      return;
   if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
      return;
   RegClass rc;
   if (num_components > vec_src.size()) {
      if (vec_src.type() == RegType::sgpr) {
         /* should still help get_alu_src() */
         emit_split_vector(ctx, vec_src, vec_src.size());
         return;
      }
      /* sub-dword split */
      rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
   } else {
      rc = RegClass(vec_src.type(), vec_src.size() / num_components);
   }
   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
   split->operands[0] = Operand(vec_src);
   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
   for (unsigned i = 0; i < num_components; i++) {
      elems[i] = {ctx->program->allocateId(), rc};
      split->definitions[i] = Definition(elems[i]);
   }
   ctx->block->instructions.emplace_back(std::move(split));
   ctx->allocated_vec.emplace(vec_src.id(), elems);
}
/* This vector expansion uses a mask to determine which elements in the new vector
 * come from the original vector. The other elements are undefined. */
void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
{
   emit_split_vector(ctx, vec_src, util_bitcount(mask));

   if (vec_src == dst)
      return;

   Builder bld(ctx->program, ctx->block);
   if (num_components == 1) {
      if (dst.type() == RegType::sgpr)
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
      else
         bld.copy(Definition(dst), vec_src);
      return;
   }

   unsigned component_size = dst.size() / num_components;
   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;

   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
   vec->definitions[0] = Definition(dst);
   unsigned k = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (mask & (1 << i)) {
         Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
         if (dst.type() == RegType::sgpr)
            src = bld.as_uniform(src);
         vec->operands[i] = Operand(src);
      } else {
         vec->operands[i] = Operand(0u);
      }
      elems[i] = vec->operands[i].getTemp();
   }
   ctx->block->instructions.emplace_back(std::move(vec));
   ctx->allocated_vec.emplace(dst.id(), elems);
}
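/* Example: expand_vector with mask 0b101 and a 2-component source places the
 * source components in dst slots 0 and 2 and fills slot 1 with a zero
 * operand, which is how sparse writemasks get materialized. */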
/* adjust misaligned small bit size loads */
void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Operand shift;
   Temp select = Temp();
   if (offset.isConstant()) {
      assert(offset.constantValue() && offset.constantValue() < 4);
      shift = Operand(offset.constantValue() * 8);
   } else {
      /* bit_offset = 8 * (offset & 0x3) */
      Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
      select = bld.tmp(s1);
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
   }

   if (vec.size() == 1) {
      bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
   } else if (vec.size() == 2) {
      Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
      bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
      if (tmp == dst)
         emit_split_vector(ctx, dst, 2);
      else
         emit_extract_vector(ctx, tmp, 0, dst);
   } else if (vec.size() == 4) {
      Temp lo = bld.tmp(s2), hi = bld.tmp(s2);
      bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
      hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
      if (select != Temp())
         hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
      lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
      Temp mid = bld.tmp(s1);
      lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
      hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
      mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
      emit_split_vector(ctx, dst, 2);
   }
}
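/* Note: this realigns a scalar load that starts at a sub-dword byte offset by
 * shifting the loaded dwords right by 8 * (offset & 0x3) bits, pulling the
 * bits that cross a dword boundary in from the next-higher dword. */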
void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
{
   Builder bld(ctx->program, ctx->block);
   if (offset.isTemp()) {
      Temp tmp[4] = {vec, vec, vec, vec};

      if (vec.size() == 4) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec);
      } else if (vec.size() == 3) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
      } else if (vec.size() == 2) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
      }
      for (unsigned i = 0; i < dst.size(); i++)
         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);

      vec = tmp[0];
      if (dst.size() == 2)
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);

      offset = Operand(0u);
   }

   unsigned num_components = vec.bytes() / component_size;
   if (vec.regClass() == dst.regClass()) {
      assert(offset.constantValue() == 0);
      bld.copy(Definition(dst), vec);
      emit_split_vector(ctx, dst, num_components);
      return;
   }

   emit_split_vector(ctx, vec, num_components);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();

   assert(offset.constantValue() % component_size == 0);
   unsigned skip = offset.constantValue() / component_size;
   for (unsigned i = skip; i < num_components; i++)
      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);

   /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
   if (dst.type() == RegType::vgpr) {
      num_components = dst.bytes() / component_size;
      aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
      for (unsigned i = 0; i < num_components; i++)
         create_vec->operands[i] = Operand(elems[i]);
      create_vec->definitions[0] = Definition(dst);
      bld.insert(std::move(create_vec));

   /* if dst is sgpr - split the src, but move the original to sgpr. */
   } else if (skip) {
      vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
      byte_align_scalar(ctx, vec, offset, dst);
   } else {
      assert(dst.size() == vec.size());
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
   }

   ctx->allocated_vec.emplace(dst.id(), elems);
}
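/* Note: v_alignbyte_b32 concatenates its first two operands into a 64-bit
 * value and shifts it right by 8 * offset bits, so the tmp[i+1]:tmp[i] pairs
 * above realign a vector loaded at an unaligned byte offset dword by dword. */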
Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(bld.lm);

   assert(val.regClass() == s1);
   assert(dst.regClass() == bld.lm);

   return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
}
Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(s1);

   assert(val.regClass() == bld.lm);
   assert(dst.regClass() == s1);

   /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
   Temp tmp = bld.tmp(s1);
   bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
   return emit_wqm(ctx, tmp, dst);
}
Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
{
   if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
      return get_ssa_temp(ctx, src.src.ssa);

   if (src.src.ssa->num_components == size) {
      bool identity_swizzle = true;
      for (unsigned i = 0; identity_swizzle && i < size; i++) {
         if (src.swizzle[i] != i)
            identity_swizzle = false;
      }
      if (identity_swizzle)
         return get_ssa_temp(ctx, src.src.ssa);
   }

   Temp vec = get_ssa_temp(ctx, src.src.ssa);
   unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
   assert(elem_size > 0);
   assert(vec.bytes() % elem_size == 0);

   if (elem_size < 4 && vec.type() == RegType::sgpr) {
      assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
      assert(size == 1);
      unsigned swizzle = src.swizzle[0];
      if (vec.size() > 1) {
         assert(src.src.ssa->bit_size == 16);
         vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
         swizzle = swizzle & 1;
      }
      if (swizzle == 0)
         return vec;

      Temp dst{ctx->program->allocateId(), s1};
      aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)};
      bfe->operands[0] = Operand(vec);
      bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
      bfe->definitions[0] = Definition(dst);
      bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
      ctx->block->instructions.emplace_back(std::move(bfe));
      return dst;
   }

   RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
   if (size == 1) {
      return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
   } else {
      assert(size <= 4);
      std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
      aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
      for (unsigned i = 0; i < size; ++i) {
         elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
         vec_instr->operands[i] = Operand{elems[i]};
      }
      Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)};
      vec_instr->definitions[0] = Definition(dst);
      ctx->block->instructions.emplace_back(std::move(vec_instr));
      ctx->allocated_vec.emplace(dst.id(), elems);
      return dst;
   }
}
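/* Note: s_bfe_u32 encodes its field descriptor in the second operand as
 * (width << 16) | offset, which is exactly what the
 * (bit_size << 16) | (bit_size * swizzle) expression above builds to pick an
 * 8- or 16-bit channel out of a packed SGPR. */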
Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
{
   if (ptr.size() == 2)
      return ptr;
   Builder bld(ctx->program, ctx->block);
   if (ptr.type() == RegType::vgpr)
      ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
                     ptr, Operand((unsigned)ctx->options->address32_hi));
}
void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
{
   aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
   sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
   sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
   sop2->definitions[0] = Definition(dst);
   if (instr->no_unsigned_wrap)
      sop2->definitions[0].setNUW(true);
   if (writes_scc)
      sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
   ctx->block->instructions.emplace_back(std::move(sop2));
}
void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
                           bool commutative, bool swap_srcs=false, bool flush_denorms = false)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
   Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
   if (src1.type() == RegType::sgpr) {
      if (commutative && src0.type() == RegType::vgpr) {
         std::swap(src0, src1);
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   if (flush_denorms && ctx->program->chip_class < GFX9) {
      assert(dst.size() == 1);
      Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
      bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
   } else {
      bld.vop2(op, Definition(dst), src0, src1);
   }
}
void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
                                   aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   if (src1.type() == RegType::sgpr) {
      assert(src0.type() == RegType::vgpr);
      std::swap(src0, src1);
   }

   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(src0.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(v1);
   Temp src11 = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
   Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
   Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
}
void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
                            bool flush_denorms = false)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   Temp src2 = get_alu_src(ctx, instr->src[2]);

   /* ensure that the instruction has at most 1 sgpr operand
    * The optimizer will inline constants for us */
   if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
      src0 = as_vgpr(ctx, src0);
   if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
      src1 = as_vgpr(ctx, src1);
   if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
      src2 = as_vgpr(ctx, src2);

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (flush_denorms && ctx->program->chip_class < GFX9) {
      assert(dst.size() == 1);
      Temp tmp = bld.vop3(op, bld.def(v1), src0, src1, src2);
      bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
   } else {
      bld.vop3(op, Definition(dst), src0, src1, src2);
   }
}
void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (dst.type() == RegType::sgpr)
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
   else
      bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
}
void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   assert(src0.size() == src1.size());

   aco_ptr<Instruction> vopc;
   if (src1.type() == RegType::sgpr) {
      if (src0.type() == RegType::vgpr) {
         /* to swap the operands, we might also have to change the opcode */
         switch (op) {
         case aco_opcode::v_cmp_lt_f16:
            op = aco_opcode::v_cmp_gt_f16;
            break;
         case aco_opcode::v_cmp_ge_f16:
            op = aco_opcode::v_cmp_le_f16;
            break;
         case aco_opcode::v_cmp_lt_i16:
            op = aco_opcode::v_cmp_gt_i16;
            break;
         case aco_opcode::v_cmp_ge_i16:
            op = aco_opcode::v_cmp_le_i16;
            break;
         case aco_opcode::v_cmp_lt_u16:
            op = aco_opcode::v_cmp_gt_u16;
            break;
         case aco_opcode::v_cmp_ge_u16:
            op = aco_opcode::v_cmp_le_u16;
            break;
         case aco_opcode::v_cmp_lt_f32:
            op = aco_opcode::v_cmp_gt_f32;
            break;
         case aco_opcode::v_cmp_ge_f32:
            op = aco_opcode::v_cmp_le_f32;
            break;
         case aco_opcode::v_cmp_lt_i32:
            op = aco_opcode::v_cmp_gt_i32;
            break;
         case aco_opcode::v_cmp_ge_i32:
            op = aco_opcode::v_cmp_le_i32;
            break;
         case aco_opcode::v_cmp_lt_u32:
            op = aco_opcode::v_cmp_gt_u32;
            break;
         case aco_opcode::v_cmp_ge_u32:
            op = aco_opcode::v_cmp_le_u32;
            break;
         case aco_opcode::v_cmp_lt_f64:
            op = aco_opcode::v_cmp_gt_f64;
            break;
         case aco_opcode::v_cmp_ge_f64:
            op = aco_opcode::v_cmp_le_f64;
            break;
         case aco_opcode::v_cmp_lt_i64:
            op = aco_opcode::v_cmp_gt_i64;
            break;
         case aco_opcode::v_cmp_ge_i64:
            op = aco_opcode::v_cmp_le_i64;
            break;
         case aco_opcode::v_cmp_lt_u64:
            op = aco_opcode::v_cmp_gt_u64;
            break;
         case aco_opcode::v_cmp_ge_u64:
            op = aco_opcode::v_cmp_le_u64;
            break;
         default: /* eq and ne are commutative */
            break;
         }
         std::swap(src0, src1);
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Builder bld(ctx->program, ctx->block);
   bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
}
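/* Note: VOP2-encoded VALU compares only accept an SGPR (or constant) in
 * src0, so when src1 is the SGPR the operands are swapped and the ordered
 * compare opcode is mirrored (lt<->gt, ge<->le); eq/ne are commutative and
 * need no opcode change. */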
void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   Builder bld(ctx->program, ctx->block);

   assert(dst.regClass() == bld.lm);
   assert(src0.type() == RegType::sgpr);
   assert(src1.type() == RegType::sgpr);
   assert(src0.regClass() == src1.regClass());

   /* Emit the SALU comparison instruction */
   Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
   /* Turn the result into a per-lane bool */
   bool_to_vector_condition(ctx, cmp, dst);
}
void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
                     aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
{
   aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
   aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
   bool use_valu = s_op == aco_opcode::num_opcodes ||
                   nir_dest_is_divergent(instr->dest.dest) ||
                   ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
                   ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
   aco_opcode op = use_valu ? v_op : s_op;
   assert(op != aco_opcode::num_opcodes);
   assert(dst.regClass() == ctx->program->lane_mask);

   if (use_valu)
      emit_vopc_instruction(ctx, instr, op, dst);
   else
      emit_sopc_instruction(ctx, instr, op, dst);
}
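/* Note: a comparison only stays on the SALU path when a scalar opcode exists,
 * the NIR destination is uniform and both sources live in SGPRs; anything
 * else falls back to the per-lane VALU compare. */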
void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   assert(dst.regClass() == bld.lm);
   assert(src0.regClass() == bld.lm);
   assert(src1.regClass() == bld.lm);

   bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
}
void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp cond = get_alu_src(ctx, instr->src[0]);
   Temp then = get_alu_src(ctx, instr->src[1]);
   Temp els = get_alu_src(ctx, instr->src[2]);

   assert(cond.regClass() == bld.lm);

   if (dst.type() == RegType::vgpr) {
      aco_ptr<Instruction> bcsel;
      if (dst.size() == 1) {
         then = as_vgpr(ctx, then);
         els = as_vgpr(ctx, els);

         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
      } else if (dst.size() == 2) {
         Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
         Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);

         Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
         Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      return;
   }

   if (instr->dest.dest.ssa.bit_size == 1) {
      assert(dst.regClass() == bld.lm);
      assert(then.regClass() == bld.lm);
      assert(els.regClass() == bld.lm);
   }

   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
      if (dst.regClass() == s1 || dst.regClass() == s2) {
         assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
         assert(dst.size() == then.size());
         aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
         bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
      } else {
         fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      return;
   }

   /* divergent boolean bcsel
    * this implements bcsel on bools: dst = s0 ? s1 : s2
    * are going to be: dst = (s0 & s1) | (~s0 & s2) */
   assert(instr->dest.dest.ssa.bit_size == 1);

   if (cond.id() != then.id())
      then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);

   if (cond.id() == els.id())
      bld.sop1(Builder::s_mov, Definition(dst), then);
   else
      bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
               bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
}
void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
                    aco_opcode op, uint32_t undo)
{
   /* multiply by 16777216 to handle denormals */
   Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
                               as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
   Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
   scaled = bld.vop1(op, bld.def(v1), scaled);
   scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);

   Temp not_scaled = bld.vop1(op, bld.def(v1), val);

   bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
}
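/* Note: the v_cmp_class_f32 mask (1 << 7) | (1 << 4) selects positive and
 * negative denormals, and 0x4b800000 is 2^24 (16777216.0f), large enough to
 * bring any single-precision denormal into the normal range before op is
 * applied; `undo` then scales the result back. */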
void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rcp_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
}

void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rsq_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
}

void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
}

void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_log_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
}
Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);

   /* GFX6 doesn't support V_TRUNC_F64, lower it. */
   /* TODO: create more efficient code! */
   if (val.type() == RegType::sgpr)
      val = as_vgpr(ctx, val);

   /* Split the input value. */
   Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);

   /* Extract the exponent and compute the unbiased value. */
   Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
   exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));

   /* Extract the fractional part. */
   Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
   fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);

   Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);

   Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
   Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
   fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
   tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
   fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);

   /* Get the sign bit. */
   Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);

   /* Decide the operation to apply depending on the unbiased exponent. */
   Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
   Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
   Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
   Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
   dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
   dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);

   return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
}
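/* Note: the lowering distinguishes three exponent ranges: exp < 0 truncates
 * to +/-0 (only the sign bit survives), 0 <= exp <= 51 clears the fraction
 * bits below the binary point via the computed mask, and exp > 51 means the
 * value is already integral and is passed through unchanged. */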
Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);

   /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
    * lowered at NIR level for precision reasons). */
   Temp src0 = as_vgpr(ctx, val);

   Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
   Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));

   Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
   Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
   Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);

   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);

   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);

   Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);

   Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
   static_cast<VOP3A_instruction*>(add)->neg[1] = true;

   return add->definitions[0].getTemp();
}
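/* Note: this computes floor(x) as x - min(fract(x), largest double < 1.0),
 * i.e. x plus the negated fraction (the neg[1] flag on v_add_f64); the clamp
 * and the NaN class check keep the result well-defined for inputs where
 * v_fract_f64 could yield 1.0 or NaN. */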
Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) {
   if (!dst.id()) {
      if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
         dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
      else
         dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
   }

   if (dst.bytes() == src.bytes() && dst_bits < src_bits)
      return bld.copy(Definition(dst), src);
   else if (dst.bytes() < src.bytes())
      return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));

   Temp tmp = dst;
   if (dst_bits == 64)
      tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);

   if (tmp == src) {
   } else if (src.regClass() == s1) {
      if (is_signed)
         bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
      else
         bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
   } else if (ctx->options->chip_class >= GFX8) {
      assert(src_bits != 8 || src.regClass() == v1b);
      assert(src_bits != 16 || src.regClass() == v2b);
      aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
      sdwa->operands[0] = Operand(src);
      sdwa->definitions[0] = Definition(tmp);
      if (is_signed)
         sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
      else
         sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
      sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
      bld.insert(std::move(sdwa));
   } else {
      assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
      aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
      bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
   }

   if (dst_bits == 64) {
      if (is_signed && dst.regClass() == s2) {
         Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else if (is_signed && dst.regClass() == v2) {
         Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else {
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
      }
   }

   return dst;
}
void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
{
   if (!instr->dest.dest.is_ssa) {
      fprintf(stderr, "nir alu dst not in ssa: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
   switch(instr->op) {
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4: {
<Temp
,NIR_MAX_VEC_COMPONENTS
> elems
;
1117 unsigned num
= instr
->dest
.dest
.ssa
.num_components
;
1118 for (unsigned i
= 0; i
< num
; ++i
)
1119 elems
[i
] = get_alu_src(ctx
, instr
->src
[i
]);
1121 if (instr
->dest
.dest
.ssa
.bit_size
>= 32 || dst
.type() == RegType::vgpr
) {
1122 aco_ptr
<Pseudo_instruction
> vec
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_create_vector
, Format::PSEUDO
, instr
->dest
.dest
.ssa
.num_components
, 1)};
1123 RegClass elem_rc
= RegClass::get(RegType::vgpr
, instr
->dest
.dest
.ssa
.bit_size
/ 8u);
1124 for (unsigned i
= 0; i
< num
; ++i
) {
1125 if (elems
[i
].type() == RegType::sgpr
&& elem_rc
.is_subdword())
1126 vec
->operands
[i
] = Operand(emit_extract_vector(ctx
, elems
[i
], 0, elem_rc
));
1128 vec
->operands
[i
] = Operand
{elems
[i
]};
1130 vec
->definitions
[0] = Definition(dst
);
1131 ctx
->block
->instructions
.emplace_back(std::move(vec
));
1132 ctx
->allocated_vec
.emplace(dst
.id(), elems
);
1134 // TODO: that is a bit suboptimal..
1135 Temp mask
= bld
.copy(bld
.def(s1
), Operand((1u << instr
->dest
.dest
.ssa
.bit_size
) - 1));
1136 for (unsigned i
= 0; i
< num
- 1; ++i
)
1137 if (((i
+1) * instr
->dest
.dest
.ssa
.bit_size
) % 32)
1138 elems
[i
] = bld
.sop2(aco_opcode::s_and_b32
, bld
.def(s1
), bld
.def(s1
, scc
), elems
[i
], mask
);
1139 for (unsigned i
= 0; i
< num
; ++i
) {
1140 unsigned bit
= i
* instr
->dest
.dest
.ssa
.bit_size
;
1141 if (bit
% 32 == 0) {
1142 elems
[bit
/ 32] = elems
[i
];
1144 elems
[i
] = bld
.sop2(aco_opcode::s_lshl_b32
, bld
.def(s1
), bld
.def(s1
, scc
),
1145 elems
[i
], Operand((i
* instr
->dest
.dest
.ssa
.bit_size
) % 32));
1146 elems
[bit
/ 32] = bld
.sop2(aco_opcode::s_or_b32
, bld
.def(s1
), bld
.def(s1
, scc
), elems
[bit
/ 32], elems
[i
]);
1149 if (dst
.size() == 1)
1150 bld
.copy(Definition(dst
), elems
[0]);
1152 bld
.pseudo(aco_opcode::p_create_vector
, Definition(dst
), elems
[0], elems
[1]);
   case nir_op_mov: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      aco_ptr<Instruction> mov;
      if (dst.type() == RegType::sgpr) {
         if (src.type() == RegType::vgpr)
            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
         else if (src.regClass() == s1)
            bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
         else if (src.regClass() == s2)
            bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
         else
            unreachable("wrong src register class for nir_op_imov");
      } else {
         if (dst.regClass() == v1)
            bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
         else if (dst.regClass() == v1b ||
                  dst.regClass() == v2b ||
                  dst.regClass() == v2)
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
         else
            unreachable("wrong src register class for nir_op_imov");
      }
      break;
   }
   case nir_op_inot: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->dest.dest.ssa.bit_size == 1) {
         assert(src.regClass() == bld.lm);
         assert(dst.regClass() == bld.lm);
         /* Don't use s_andn2 here, this allows the optimizer to make a better decision */
         Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
         bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
      } else if (dst.regClass() == v2) {
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
         hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
      } else if (dst.type() == RegType::sgpr) {
         aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
         bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ineg: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v1) {
         bld.vsub32(Definition(dst), Operand(0u), Operand(src));
      } else if (dst.regClass() == s1) {
         bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
      } else if (dst.size() == 2) {
         Temp src0 = bld.tmp(dst.type(), 1);
         Temp src1 = bld.tmp(dst.type(), 1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);

         if (dst.regClass() == s2) {
            Temp carry = bld.tmp(s1);
            Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
            Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
         } else {
            Temp lower = bld.tmp(v1);
            Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
            Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
         }
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_iabs: {
      if (dst.regClass() == s1) {
         bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v1) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_isign: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == s1) {
         Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1));
         bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u));
      } else if (dst.regClass() == s2) {
         Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
         Temp neqz;
         if (ctx->program->chip_class >= GFX8)
            neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
         else
            neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
         /* SCC gets zero-extended to 64 bit */
         bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
      } else if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand((uint32_t)-1), src, Operand(1u));
      } else if (dst.regClass() == v2) {
         Temp upper = emit_extract_vector(ctx, src, 1, v1);
         Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
         Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
         Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_imax: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_umax: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_imin: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_umin: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ior: {
      if (instr->dest.dest.ssa.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_or, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_iand: {
      if (instr->dest.dest.ssa.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_and, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ixor: {
      if (instr->dest.dest.ssa.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ushr: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
      } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
         bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
                  get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
                  get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ishl: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
      } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
         bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
                  get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
                  get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ishr: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
      } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
         bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
                  get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
                  get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_find_lsb: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1) {
         bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
      } else if (src.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
      } else if (src.regClass() == s2) {
         bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_ufind_msb:
   case nir_op_ifind_msb: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1 || src.regClass() == s2) {
         aco_opcode op = src.regClass() == s2 ?
                         (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
                         (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
         Temp msb_rev = bld.sop1(op, bld.def(s1), src);

         Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
                                        Operand(src.size() * 32u - 1u), msb_rev);
         Temp msb = sub.def(0).getTemp();
         Temp carry = sub.def(1).getTemp();

         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
      } else if (src.regClass() == v1) {
         aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
         Temp msb_rev = bld.tmp(v1);
         emit_vop1_instruction(ctx, instr, op, msb_rev);
         Temp msb = bld.tmp(v1);
         Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
case nir_op_bitfield_reverse: {
   if (dst.regClass() == s1) {
      bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
   } else if (dst.regClass() == v1) {
      bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_iadd: {
   if (dst.regClass() == s1) {
      emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
      break;
   }

   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   if (dst.regClass() == v1) {
      bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
      break;
   }

   assert(src0.size() == 2 && src1.size() == 2);
   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(dst.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(src1.type(), 1);
   Temp src11 = bld.tmp(dst.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);

   if (dst.regClass() == s2) {
      Temp carry = bld.tmp(s1);
      Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
      Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
   } else if (dst.regClass() == v2) {
      Temp dst0 = bld.tmp(v1);
      Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
      Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_uadd_sat: {
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   if (dst.regClass() == s1) {
      Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
      bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
      bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
   } else if (dst.regClass() == v1) {
      if (ctx->options->chip_class >= GFX9) {
         aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
         add->operands[0] = Operand(src0);
         add->operands[1] = Operand(src1);
         add->definitions[0] = Definition(dst);
         add->clamp = 1;
         ctx->block->instructions.emplace_back(std::move(add));
      } else {
         if (src1.regClass() != v1)
            std::swap(src0, src1);
         assert(src1.regClass() == v1);
         Temp tmp = bld.tmp(v1);
         Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
      }
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_uadd_carry: {
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   if (dst.regClass() == s1) {
      bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
      break;
   }
   if (dst.regClass() == v1) {
      Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
      bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
      break;
   }

   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(dst.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(src1.type(), 1);
   Temp src11 = bld.tmp(dst.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
   if (dst.regClass() == s2) {
      Temp carry = bld.tmp(s1);
      bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
      carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
   } else if (dst.regClass() == v2) {
      Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
      carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
      carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_isub: {
   if (dst.regClass() == s1) {
      emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
      break;
   }

   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   if (dst.regClass() == v1) {
      bld.vsub32(Definition(dst), src0, src1);
      break;
   }

   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(dst.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(src1.type(), 1);
   Temp src11 = bld.tmp(dst.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
   if (dst.regClass() == s2) {
      Temp carry = bld.tmp(s1);
      Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
      Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
   } else if (dst.regClass() == v2) {
      Temp lower = bld.tmp(v1);
      Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
      Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_usub_borrow: {
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   if (dst.regClass() == s1) {
      bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
      break;
   } else if (dst.regClass() == v1) {
      Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
      bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
      break;
   }

   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(dst.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(src1.type(), 1);
   Temp src11 = bld.tmp(dst.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
   if (dst.regClass() == s2) {
      Temp borrow = bld.tmp(s1);
      bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
      borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
   } else if (dst.regClass() == v2) {
      Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
      borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
      borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_imul: {
   if (dst.regClass() == v1) {
      bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
               get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
   } else if (dst.regClass() == s1) {
      emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_umul_high: {
   if (dst.regClass() == v1) {
      bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
   } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
      bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
   } else if (dst.regClass() == s1) {
      Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
                          as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_imul_high: {
   if (dst.regClass() == v1) {
      bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
   } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
      bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
   } else if (dst.regClass() == s1) {
      Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
                          as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fmul: {
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
   if (dst.regClass() == v2b) {
      emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
   } else if (dst.regClass() == v1) {
      emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
   } else if (dst.regClass() == v2) {
      bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fadd: {
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
   if (dst.regClass() == v2b) {
      emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
   } else if (dst.regClass() == v1) {
      emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
   } else if (dst.regClass() == v2) {
      bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fsub: {
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   if (dst.regClass() == v2b) {
      if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
         emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
      else
         emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
   } else if (dst.regClass() == v1) {
      if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
         emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
      else
         emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
   } else if (dst.regClass() == v2) {
      Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
                                  as_vgpr(ctx, src0), as_vgpr(ctx, src1));
      VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
      sub->neg[1] = true;
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fmax: {
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
   if (dst.regClass() == v2b) {
      // TODO: check fp_mode.must_flush_denorms16_64
      emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
   } else if (dst.regClass() == v1) {
      emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
   } else if (dst.regClass() == v2) {
      if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
         Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1);
         bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
      } else {
         bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
      }
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fmin: {
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
   if (dst.regClass() == v2b) {
      // TODO: check fp_mode.must_flush_denorms16_64
      emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
   } else if (dst.regClass() == v1) {
      emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
   } else if (dst.regClass() == v2) {
      if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
         Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1);
         bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
      } else {
         bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
      }
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fmax3: {
   if (dst.regClass() == v2b) {
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false);
   } else if (dst.regClass() == v1) {
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fmin3: {
   if (dst.regClass() == v2b) {
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false);
   } else if (dst.regClass() == v1) {
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fmed3: {
   if (dst.regClass() == v2b) {
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false);
   } else if (dst.regClass() == v1) {
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_umax3: {
   if (dst.size() == 1) {
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_umin3: {
   if (dst.size() == 1) {
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_umed3: {
   if (dst.size() == 1) {
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_imax3: {
   if (dst.size() == 1) {
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_imin3: {
   if (dst.size() == 1) {
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_imed3: {
   if (dst.size() == 1) {
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_cube_face_coord: {
   Temp in = get_alu_src(ctx, instr->src[0], 3);
   Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
                   emit_extract_vector(ctx, in, 1, v1),
                   emit_extract_vector(ctx, in, 2, v1) };
   Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
   ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
   Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
   Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
   sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
   tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
   break;
}
case nir_op_cube_face_index: {
   Temp in = get_alu_src(ctx, instr->src[0], 3);
   Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
                   emit_extract_vector(ctx, in, 1, v1),
                   emit_extract_vector(ctx, in, 2, v1) };
   bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
   break;
}
case nir_op_bcsel: {
   emit_bcsel(ctx, instr, dst);
   break;
}
case nir_op_frsq: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (dst.regClass() == v2b) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
   } else if (dst.regClass() == v1) {
      emit_rsq(ctx, bld, Definition(dst), src);
   } else if (dst.regClass() == v2) {
      /* Lowered at NIR level for precision reasons. */
      emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fneg: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (dst.regClass() == v2b) {
      if (ctx->block->fp_mode.must_flush_denorms16_64)
         src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
      bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
   } else if (dst.regClass() == v1) {
      if (ctx->block->fp_mode.must_flush_denorms32)
         src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
      bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
   } else if (dst.regClass() == v2) {
      if (ctx->block->fp_mode.must_flush_denorms16_64)
         src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
      Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
      upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fabs: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (dst.regClass() == v2b) {
      if (ctx->block->fp_mode.must_flush_denorms16_64)
         src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
      bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
   } else if (dst.regClass() == v1) {
      if (ctx->block->fp_mode.must_flush_denorms32)
         src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
      bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
   } else if (dst.regClass() == v2) {
      if (ctx->block->fp_mode.must_flush_denorms16_64)
         src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
      Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
      upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fsat: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (dst.regClass() == v2b) {
      bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
   } else if (dst.regClass() == v1) {
      bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
      /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
      // TODO: confirm that this holds under any circumstances
   } else if (dst.regClass() == v2) {
      Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
      VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
      vop3->clamp = true;
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_flog2: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (dst.regClass() == v2b) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
   } else if (dst.regClass() == v1) {
      emit_log2(ctx, bld, Definition(dst), src);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_frcp: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (dst.regClass() == v2b) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
   } else if (dst.regClass() == v1) {
      emit_rcp(ctx, bld, Definition(dst), src);
   } else if (dst.regClass() == v2) {
      /* Lowered at NIR level for precision reasons. */
      emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fexp2: {
   if (dst.regClass() == v2b) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
   } else if (dst.regClass() == v1) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fsqrt: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (dst.regClass() == v2b) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
   } else if (dst.regClass() == v1) {
      emit_sqrt(ctx, bld, Definition(dst), src);
   } else if (dst.regClass() == v2) {
      /* Lowered at NIR level for precision reasons. */
      emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_ffract: {
   if (dst.regClass() == v2b) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
   } else if (dst.regClass() == v1) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
   } else if (dst.regClass() == v2) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_ffloor: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (dst.regClass() == v2b) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
   } else if (dst.regClass() == v1) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
   } else if (dst.regClass() == v2) {
      emit_floor_f64(ctx, bld, Definition(dst), src);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fceil: {
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   if (dst.regClass() == v2b) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
   } else if (dst.regClass() == v1) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
   } else if (dst.regClass() == v2) {
      if (ctx->options->chip_class >= GFX7) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
      } else {
         /* GFX6 doesn't support V_CEIL_F64, lower it. */
         /* trunc = trunc(src0)
          * if (src0 > 0.0 && src0 != trunc)
          *    trunc += 1.0
          */
         Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
         Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u));
         Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
         Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1);
         Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)),
                             bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond);
         add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add);
         bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
      }
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_ftrunc: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (dst.regClass() == v2b) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
   } else if (dst.regClass() == v1) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
   } else if (dst.regClass() == v2) {
      emit_trunc_f64(ctx, bld, Definition(dst), src);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fround_even: {
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   if (dst.regClass() == v2b) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
   } else if (dst.regClass() == v1) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
   } else if (dst.regClass() == v2) {
      if (ctx->options->chip_class >= GFX7) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
      } else {
         /* GFX6 doesn't support V_RNDNE_F64, lower it. */
         Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);

         Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u)));
         Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
                             bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi));
         Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,
                             bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
         Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,
                                     bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
         static_cast<VOP3A_instruction*>(sub)->neg[1] = true;
         tmp = sub->definitions[0].getTemp();

         Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu));
         Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
         static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
         Temp cond = vop3->definitions[0].getTemp();

         Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
         Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond);
         Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      }
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fsin:
case nir_op_fcos: {
   Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
   aco_ptr<Instruction> norm;
   if (dst.regClass() == v2b) {
      Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
      Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
      aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
      bld.vop1(opcode, Definition(dst), tmp);
   } else if (dst.regClass() == v1) {
      Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
      Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);

      /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
      if (ctx->options->chip_class < GFX9)
         tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);

      aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
      bld.vop1(opcode, Definition(dst), tmp);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_ldexp: {
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   if (dst.regClass() == v2b) {
      emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
   } else if (dst.regClass() == v1) {
      bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
   } else if (dst.regClass() == v2) {
      bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_frexp_sig: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (dst.regClass() == v2b) {
      bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
   } else if (dst.regClass() == v1) {
      bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
   } else if (dst.regClass() == v2) {
      bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_frexp_exp: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (instr->src[0].src.ssa->bit_size == 16) {
      Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
      tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u));
      convert_int(ctx, bld, tmp, 8, 32, true, dst);
   } else if (instr->src[0].src.ssa->bit_size == 32) {
      bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
   } else if (instr->src[0].src.ssa->bit_size == 64) {
      bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_fsign: {
   Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
   if (dst.regClass() == v2b) {
      Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
      Temp minus_one = bld.copy(bld.def(v1), Operand(0xbc00u));
      Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
      src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
      cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
      bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
   } else if (dst.regClass() == v1) {
      Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
      src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
      cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
      bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
   } else if (dst.regClass() == v2) {
      Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
      Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
      Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);

      cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
      tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
      upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);

      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_f2f16:
case nir_op_f2f16_rtne: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (instr->src[0].src.ssa->bit_size == 64)
      src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
   if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
      /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
       * keep value numbering and the scheduler simpler.
       */
      bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
   else
      bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
   break;
}
case nir_op_f2f16_rtz: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (instr->src[0].src.ssa->bit_size == 64)
      src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
   bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
   break;
}
case nir_op_f2f32: {
   if (instr->src[0].src.ssa->bit_size == 16) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
   } else if (instr->src[0].src.ssa->bit_size == 64) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_f2f64: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (instr->src[0].src.ssa->bit_size == 16)
      src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
   bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
   break;
}
case nir_op_i2f16: {
   assert(dst.regClass() == v2b);
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (instr->src[0].src.ssa->bit_size == 8)
      src = convert_int(ctx, bld, src, 8, 16, true);
   else if (instr->src[0].src.ssa->bit_size == 64)
      src = convert_int(ctx, bld, src, 64, 32, false);
   bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
   break;
}
case nir_op_i2f32: {
   assert(dst.size() == 1);
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (instr->src[0].src.ssa->bit_size <= 16)
      src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
   bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
   break;
}
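/* i2f64/u2f64: there is no 64-bit-int-to-double instruction, so the source
 * is split into two 32-bit halves, each converted separately, the high half
 * scaled by 2^32 with v_ldexp_f64, and the two doubles added. */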
case nir_op_i2f64: {
   if (instr->src[0].src.ssa->bit_size <= 32) {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size <= 16)
         src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
      bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
   } else if (instr->src[0].src.ssa->bit_size == 64) {
      Temp src = get_alu_src(ctx, instr->src[0]);
      RegClass rc = RegClass(src.type(), 1);
      Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
      bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
      lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
      upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
      upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
      bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_u2f16: {
   assert(dst.regClass() == v2b);
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (instr->src[0].src.ssa->bit_size == 8)
      src = convert_int(ctx, bld, src, 8, 16, false);
   else if (instr->src[0].src.ssa->bit_size == 64)
      src = convert_int(ctx, bld, src, 64, 32, false);
   bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
   break;
}
case nir_op_u2f32: {
   assert(dst.size() == 1);
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (instr->src[0].src.ssa->bit_size == 8) {
      bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
   } else {
      if (instr->src[0].src.ssa->bit_size == 16)
         src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
      bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
   }
   break;
}
case nir_op_u2f64: {
   if (instr->src[0].src.ssa->bit_size <= 32) {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size <= 16)
         src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
      bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
   } else if (instr->src[0].src.ssa->bit_size == 64) {
      Temp src = get_alu_src(ctx, instr->src[0]);
      RegClass rc = RegClass(src.type(), 1);
      Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
      bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
      lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
      upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
      upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
      bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_f2i16: {
   if (instr->src[0].src.ssa->bit_size == 16)
      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
   else if (instr->src[0].src.ssa->bit_size == 32)
      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
   else
      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
   break;
}
case nir_op_f2u16: {
   if (instr->src[0].src.ssa->bit_size == 16)
      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
   else if (instr->src[0].src.ssa->bit_size == 32)
      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
   else
      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
   break;
}
case nir_op_f2i32: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (instr->src[0].src.ssa->bit_size == 16) {
      Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
      if (dst.type() == RegType::vgpr) {
         bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
      } else {
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                    bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
      }
   } else if (instr->src[0].src.ssa->bit_size == 32) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
   } else if (instr->src[0].src.ssa->bit_size == 64) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_f2u32: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (instr->src[0].src.ssa->bit_size == 16) {
      Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
      if (dst.type() == RegType::vgpr) {
         bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
      } else {
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                    bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
      }
   } else if (instr->src[0].src.ssa->bit_size == 32) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
   } else if (instr->src[0].src.ssa->bit_size == 64) {
      emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
   } else {
      fprintf(stderr, "Unimplemented NIR instr bit size: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
   }
   break;
}
case nir_op_f2i64: {
   Temp src = get_alu_src(ctx, instr->src[0]);
   if (instr->src[0].src.ssa->bit_size == 16)
      src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);

   if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
      Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
      exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
      Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
      Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
      mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
      mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
      mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
      Temp new_exponent = bld.tmp(v1);
      Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
      if (ctx->program->chip_class >= GFX8)
         mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
      else
         mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
      Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
      Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
      lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
      upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
      lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
      upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
      Temp new_lower = bld.tmp(v1);
      borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
      Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);

   } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
      if (src.type() == RegType::vgpr)
         src = bld.as_uniform(src);
      Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
      exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
      exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
      exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
      Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
      Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
      mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
      mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
      mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
      exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
      mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
      Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
      Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
      mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
      Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
      lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
      upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
      Temp borrow = bld.tmp(s1);
      lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
      upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);

   } else if (instr->src[0