+ if (ctx->options->chip_class <= GFX7) {
+ /* GFX6-7: there is no bpermute instruction */
+ Operand index_op(index);
+ Operand input_data(data);
+ index_op.setLateKill(true);
+ input_data.setLateKill(true);
+
+ return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
+ } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
+ /* GFX10 wave64 mode: emulate full-wave bpermute */
+ if (!ctx->has_gfx10_wave64_bpermute) {
+ ctx->has_gfx10_wave64_bpermute = true;
+ ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
+ ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
+ }
+
+ Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
+ Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
+ Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
+ Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
+ Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
+ Operand input_data(data);
+
+ index_x4.setLateKill(true);
+ input_data.setLateKill(true);
+ same_half.setLateKill(true);
+
+ return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
+ } else {
+ /* GFX8-9 or GFX10 wave32: bpermute works normally */
+ Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);