bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
else
unreachable("wrong src register class for nir_op_imov");
- } else if (dst.regClass() == v1) {
- bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
- } else if (dst.regClass() == v2) {
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
} else {
- nir_print_instr(&instr->instr, stderr);
- unreachable("Should have been lowered to scalar.");
+ if (dst.regClass() == v1)
+ bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
+ else if (dst.regClass() == v1b ||
+ dst.regClass() == v2b ||
+ dst.regClass() == v2)
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
+ else
+ unreachable("wrong src register class for nir_op_imov");
}
break;
}
bool allow_combining = true, bool reorder = true, bool slc = false)
{
Builder bld(ctx->program, ctx->block);
- assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+ assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
assert(write_mask);
write_mask = widen_mask(write_mask, elem_size_bytes);
unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
{
- assert(elem_size_bytes == 4 || elem_size_bytes == 8);
- assert((num_components * elem_size_bytes / 4) == dst.size());
+ assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
+ assert((num_components * elem_size_bytes) == dst.bytes());
assert(!!stride != allow_combining);
Builder bld(ctx->program, ctx->block);
Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
Builder bld(ctx->program, ctx->block);
- Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
- if (ctx->program->has_16bank_lds)
- interp_p1.instr->operands[0].setLateKill(true);
- bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, component);
+
+ if (dst.regClass() == v2b) {
+ if (ctx->program->has_16bank_lds) {
+ assert(ctx->options->chip_class <= GFX8);
+ Builder::Result interp_p1 =
+ bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1),
+ Operand(2u) /* P0 */, bld.m0(prim_mask), idx, component);
+ interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b),
+ coord1, bld.m0(prim_mask), interp_p1, idx, component);
+ bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2,
+ bld.m0(prim_mask), interp_p1, idx, component);
+ } else {
+ aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
+
+ if (ctx->options->chip_class == GFX8)
+ interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
+
+ Builder::Result interp_p1 =
+ bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1),
+ coord1, bld.m0(prim_mask), idx, component);
+ bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask),
+ interp_p1, idx, component);
+ }
+ } else {
+ Builder::Result interp_p1 =
+ bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
+ bld.m0(prim_mask), idx, component);
+
+ if (ctx->program->has_16bank_lds)
+ interp_p1.instr->operands[0].setLateKill(true);
+
+ bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2,
+ bld.m0(prim_mask), interp_p1, idx, component);
+ }
}
void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
unsigned component = nir_intrinsic_component(instr);
+ unsigned bitsize = instr->dest.ssa.bit_size;
unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
/* load channels */
while (channel_start < num_channels) {
- unsigned fetch_size = num_channels - channel_start;
+ unsigned fetch_component = num_channels - channel_start;
unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
bool expanded = false;
vtx_info->chan_byte_size == 4;
unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
if (!use_mubuf) {
- fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size);
+ fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component);
} else {
- if (fetch_size == 3 && ctx->options->chip_class == GFX6) {
+ if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
/* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
- fetch_size = 4;
+ fetch_component = 4;
expanded = true;
}
}
+ unsigned fetch_bytes = fetch_component * bitsize / 8;
+
Temp fetch_index = index;
if (attrib_stride != 0 && fetch_offset > attrib_stride) {
fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index);
}
aco_opcode opcode;
- switch (fetch_size) {
- case 1:
- opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
- break;
+ switch (fetch_bytes) {
case 2:
- opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
+ assert(!use_mubuf && bitsize == 16);
+ opcode = aco_opcode::tbuffer_load_format_d16_x;
+ break;
+ case 4:
+ if (bitsize == 16) {
+ assert(!use_mubuf);
+ opcode = aco_opcode::tbuffer_load_format_d16_xy;
+ } else {
+ opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
+ }
break;
- case 3:
+ case 6:
+ assert(!use_mubuf && bitsize == 16);
+ opcode = aco_opcode::tbuffer_load_format_d16_xyz;
+ break;
+ case 8:
+ if (bitsize == 16) {
+ assert(!use_mubuf);
+ opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
+ } else {
+ opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
+ }
+ break;
+ case 12:
assert(ctx->options->chip_class >= GFX7 ||
(!use_mubuf && ctx->options->chip_class == GFX6));
opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
break;
- case 4:
+ case 16:
opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
break;
default:
}
Temp fetch_dst;
- if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle &&
+ if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle &&
!expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE ||
num_channels <= 3)) {
direct_fetch = true;
fetch_dst = dst;
} else {
- fetch_dst = bld.tmp(RegType::vgpr, fetch_size);
+ fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
}
if (use_mubuf) {
emit_split_vector(ctx, fetch_dst, fetch_dst.size());
- if (fetch_size == 1) {
+ if (fetch_component == 1) {
channels[channel_start] = fetch_dst;
} else {
- for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++)
- channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1);
+ for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
+ channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i,
+ bitsize == 16 ? v2b : v1);
}
- channel_start += fetch_size;
+ channel_start += fetch_component;
}
if (!direct_fetch) {
case nir_intrinsic_read_first_invocation: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- if (src.regClass() == v1) {
+ if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
emit_wqm(ctx,
bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
dst);
} else if (cluster_size == 1) {
bld.copy(Definition(dst), src);
} else {
- src = as_vgpr(ctx, src);
+ unsigned bit_size = instr->src[0].ssa->bit_size;
+
+ src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
ReduceOp reduce_op;
switch (op) {
- #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
- CASE(iadd)
- CASE(imul)
- CASE(fadd)
- CASE(fmul)
- CASE(imin)
- CASE(umin)
- CASE(fmin)
- CASE(imax)
- CASE(umax)
- CASE(fmax)
- CASE(iand)
- CASE(ior)
- CASE(ixor)
+ #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break;
+ #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break;
+ CASEI(iadd)
+ CASEI(imul)
+ CASEI(imin)
+ CASEI(umin)
+ CASEI(imax)
+ CASEI(umax)
+ CASEI(iand)
+ CASEI(ior)
+ CASEI(ixor)
+ CASEF(fadd)
+ CASEF(fmul)
+ CASEF(fmin)
+ CASEF(fmax)
default:
unreachable("unknown reduction op");
- #undef CASE
+ #undef CASEI
+ #undef CASEF
}
aco_opcode aco_op;