* the same qword.
* (...)"
*
- * This means that 32-bit to 64-bit conversions need to have the 32-bit
- * data elements aligned to 64-bit. This restriction does not apply to
- * BDW and later.
+ * This means that conversions from bit-sizes smaller than 64-bit to
+ * 64-bit need to have the source data elements aligned to 64-bit.
+ * This restriction does not apply to BDW and later.
*/
if (nir_dest_bit_size(instr->dest.dest) == 64 &&
- nir_src_bit_size(instr->src[0].src) == 32 &&
+ nir_src_bit_size(instr->src[0].src) < 64 &&
(devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
fs_reg tmp = bld.vgrf(result.type, 1);
tmp = subscript(tmp, op[0].type, 0);
case nir_op_feq:
case nir_op_fne: {
fs_reg dest = result;
- if (nir_src_bit_size(instr->src[0].src) > 32) {
- dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
- }
+
+ const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ if (bit_size != 32)
+ dest = bld.vgrf(op[0].type, 1);
+
brw_conditional_mod cond;
switch (instr->op) {
case nir_op_flt:
default:
unreachable("bad opcode");
}
+
bld.CMP(dest, op[0], op[1], cond);
- if (nir_src_bit_size(instr->src[0].src) > 32) {
+
+ if (bit_size > 32) {
bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+ } else if (bit_size < 32) {
+ /* When we convert the result to 32-bit we need to do it as a signed
+ * conversion so that true (~0) is sign-extended to the full 32 bits.
+ */
+ const brw_reg_type src_type =
+ brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
+
+ bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
}
break;
}
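
To see why the signed conversion matters for the sub-32-bit cases above:
CMP writes ~0 for true, and only a signed widening preserves that pattern
in 32 bits. A standalone C++ sketch, outside the i965 IR, with
illustrative names only:

#include <cstdint>
#include <cstdio>

int main()
{
   uint16_t cmp_true = 0xffff;            /* 16-bit CMP result for true */

   uint32_t zero_ext = cmp_true;          /* unsigned widening: 0x0000ffff */
   int32_t  sign_ext = (int16_t)cmp_true; /* signed widening:   0xffffffff */

   printf("zero-extended: 0x%08x\n", zero_ext);
   printf("sign-extended: 0x%08x\n", (uint32_t)sign_ext);
   return 0;
}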
case nir_op_ieq:
case nir_op_ine: {
fs_reg dest = result;
- if (nir_src_bit_size(instr->src[0].src) > 32) {
- dest = bld.vgrf(BRW_REGISTER_TYPE_UQ, 1);
- }
+
+ const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ if (bit_size != 32)
+ dest = bld.vgrf(op[0].type, 1);
brw_conditional_mod cond;
switch (instr->op) {
unreachable("bad opcode");
}
bld.CMP(dest, op[0], op[1], cond);
- if (nir_src_bit_size(instr->src[0].src) > 32) {
+
+ if (bit_size > 32) {
bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+ } else if (bit_size < 32) {
+ /* When we convert the result to 32-bit we need to do it as a signed
+ * conversion so that true (~0) is sign-extended to the full 32 bits.
+ */
+ const brw_reg_type src_type =
+ brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
+
+ bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
}
break;
}
break;
case nir_op_pack_64_2x32_split:
+ case nir_op_pack_32_2x16_split:
bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
break;
break;
}
+ case nir_op_unpack_32_2x16_split_x:
+ case nir_op_unpack_32_2x16_split_y: {
+ if (instr->op == nir_op_unpack_32_2x16_split_x)
+ bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
+ else
+ bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
+ break;
+ }
+
case nir_op_fpow:
inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
inst->saturate = instr->dest.saturate;
fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
switch (instr->def.bit_size) {
+ case 16:
+ for (unsigned i = 0; i < instr->def.num_components; i++)
+ bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value.i16[i]));
+ break;
+
case 32:
for (unsigned i = 0; i < instr->def.num_components; i++)
bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
}
if (type_sz(dst.type) == 8) {
- shuffle_32bit_load_result_to_64bit_data(
- bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
-
- for (unsigned c = 0; c < num_components; c++)
- bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
+ shuffle_from_32bit_read(bld,
+ offset(dst, bld, iter * 2),
+ retype(tmp_dst, BRW_REGISTER_TYPE_D),
+ 0,
+ num_components);
}
if (num_iterations > 1) {
1 /* dims */,
num_components_32bit,
BRW_PREDICATE_NONE);
- shuffle_32bit_load_result_to_16bit_data(bld,
- retype(dest, BRW_REGISTER_TYPE_W),
- retype(read_result, BRW_REGISTER_TYPE_D),
- first_component, num_components);
+ shuffle_from_32bit_read(bld, dest, read_result, first_component,
+ num_components);
} else {
fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
for (unsigned i = 0; i < num_components; i++) {
BRW_PREDICATE_NONE);
/* Shuffle the 32-bit load result into valid 64-bit data */
- const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
- shuffle_32bit_load_result_to_64bit_data(
- bld, packed_result, read_result, iter_components);
-
- /* Move each component to its destination */
- read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
- for (int c = 0; c < iter_components; c++) {
- bld.MOV(offset(dest, bld, it * 2 + c),
- offset(packed_result, bld, c));
- }
+ shuffle_from_32bit_read(bld, offset(dest, bld, it * 2),
+ read_result, 0, iter_components);
bld.ADD(read_offset, read_offset, brw_imm_ud(16));
}
case nir_intrinsic_load_input: {
/* load_input is only used for flat inputs */
unsigned base = nir_intrinsic_base(instr);
- unsigned component = nir_intrinsic_component(instr);
+ unsigned comp = nir_intrinsic_component(instr);
unsigned num_components = instr->num_components;
enum brw_reg_type type = dest.type;
/* Special case fields in the VUE header */
if (base == VARYING_SLOT_LAYER)
- component = 1;
+ comp = 1;
else if (base == VARYING_SLOT_VIEWPORT)
- component = 2;
+ comp = 2;
if (nir_dest_bit_size(instr->dest) == 64) {
/* const_index is in 32-bit type size units that could not be aligned
}
for (unsigned int i = 0; i < num_components; i++) {
- struct brw_reg interp = interp_reg(base, component + i);
- interp = suboffset(interp, 3);
- bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
- retype(fs_reg(interp), type));
+ bld.MOV(offset(retype(dest, type), bld, i),
+ retype(component(interp_reg(base, comp + i), 3), type));
}
if (nir_dest_bit_size(instr->dest) == 64) {
for (unsigned int i = 0; i < instr->num_components; i++) {
fs_reg interp =
- fs_reg(interp_reg(nir_intrinsic_base(instr),
- nir_intrinsic_component(instr) + i));
+ component(interp_reg(nir_intrinsic_base(instr),
+ nir_intrinsic_component(instr) + i), 0);
interp.type = BRW_REGISTER_TYPE_F;
dest.type = BRW_REGISTER_TYPE_F;
* aligned. Shuffling only one component would be the same as
* striding it.
*/
- fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D,
- DIV_ROUND_UP(num_components, 2));
- shuffle_16bit_data_for_32bit_write(bld, tmp, write_src,
- num_components);
- write_src = tmp;
+ write_src = shuffle_for_32bit_write(bld, write_src, 0,
+ num_components);
}
fs_reg offset_reg;
break;
}
+ case nir_intrinsic_begin_invocation_interlock: {
+ const fs_builder ubld = bld.group(8, 0);
+ const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+
+ ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 * REG_SIZE;
+
+ break;
+ }
+
+ case nir_intrinsic_end_invocation_interlock: {
+ /* We don't need to do anything here */
+ break;
+ }
+
default:
unreachable("unknown intrinsic");
}
}
}
-void
-shuffle_32bit_load_result_to_16bit_data(const fs_builder &bld,
- const fs_reg &dst,
- const fs_reg &src,
- uint32_t first_component,
- uint32_t components)
-{
- assert(type_sz(src.type) == 4);
- assert(type_sz(dst.type) == 2);
-
- /* A temporary is used to un-shuffle the 32-bit data of each component in
- * into a valid 16-bit vector. We can't write directly to dst because it
- * can be the same register as src and in that case the first MOV in the
- * loop below would overwrite the data read in the second MOV.
- */
- fs_reg tmp = retype(bld.vgrf(src.type), dst.type);
-
- for (unsigned i = 0; i < components; i++) {
- const fs_reg component_i =
- subscript(offset(src, bld, (first_component + i) / 2), dst.type,
- (first_component + i) % 2);
-
- bld.MOV(offset(tmp, bld, i % 2), component_i);
-
- if (i % 2) {
- bld.MOV(offset(dst, bld, i -1), offset(tmp, bld, 0));
- bld.MOV(offset(dst, bld, i), offset(tmp, bld, 1));
- }
- }
- if (components % 2) {
- bld.MOV(offset(dst, bld, components - 1), tmp);
- }
-}
-
/**
* This helper does the inverse operation of
* SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
return dst;
}
+/*
+ * This helper takes a source register and un/shuffles it into the destination
+ * register.
+ *
+ * If the source type size is smaller than the destination type size, the
+ * operation needed is a component shuffle; the opposite case is an
+ * unshuffle. If the source and destination type sizes are equal, the
+ * shuffle degenerates to a simple per-component MOV.
+ *
+ * For example, take a 16-bit source and a 32-bit destination. A
+ * 3-component .xyz 16-bit vector in SIMD8 would be laid out as:
+ *
+ * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
+ * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | |
+ *
+ * This helper would produce the following two 32-bit components with the
+ * 16-bit values shuffled:
+ *
+ * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
+ * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 |
+ *
+ * For an unshuffle the example is the opposite: a 64-bit source and a
+ * 32-bit destination. A 2-component .xy 64-bit vector in SIMD8 would be:
+ *
+ * | x1l x1h | x2l x2h | x3l x3h | x4l x4h |
+ * | x5l x5h | x6l x6h | x7l x7h | x8l x8h |
+ * | y1l y1h | y2l y2h | y3l y3h | y4l y4h |
+ * | y5l y5h | y6l y6h | y7l y7h | y8l y8h |
+ *
+ * The unshuffle would produce the following four 32-bit components:
+ *
+ * | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
+ * | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
+ * | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
+ * | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
+ *
+ * - The source and destination registers must not overlap.
+ * - components is measured in units of the smaller of the two types,
+ * because we un/shuffle the smaller components from/into the bigger
+ * ones.
+ * - first_component allows skipping leading source components.
+ */
void
-shuffle_16bit_data_for_32bit_write(const fs_builder &bld,
- const fs_reg &dst,
- const fs_reg &src,
- uint32_t components)
+shuffle_src_to_dst(const fs_builder &bld,
+ const fs_reg &dst,
+ const fs_reg &src,
+ uint32_t first_component,
+ uint32_t components)
+{
+ if (type_sz(src.type) == type_sz(dst.type)) {
+ assert(!regions_overlap(dst,
+ type_sz(dst.type) * bld.dispatch_width() * components,
+ offset(src, bld, first_component),
+ type_sz(src.type) * bld.dispatch_width() * components));
+ for (unsigned i = 0; i < components; i++) {
+ bld.MOV(retype(offset(dst, bld, i), src.type),
+ offset(src, bld, i + first_component));
+ }
+ } else if (type_sz(src.type) < type_sz(dst.type)) {
+ /* Source is shuffled into destination */
+ unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
+ assert(!regions_overlap(dst,
+ type_sz(dst.type) * bld.dispatch_width() *
+ DIV_ROUND_UP(components, size_ratio),
+ offset(src, bld, first_component),
+ type_sz(src.type) * bld.dispatch_width() * components));
+
+ brw_reg_type shuffle_type =
+ brw_reg_type_from_bit_size(8 * type_sz(src.type),
+ BRW_REGISTER_TYPE_D);
+ for (unsigned i = 0; i < components; i++) {
+ fs_reg shuffle_component_i =
+ subscript(offset(dst, bld, i / size_ratio),
+ shuffle_type, i % size_ratio);
+ bld.MOV(shuffle_component_i,
+ retype(offset(src, bld, i + first_component), shuffle_type));
+ }
+ } else {
+ /* Source is unshuffled into destination */
+ unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
+ assert(!regions_overlap(dst,
+ type_sz(dst.type) * bld.dispatch_width() * components,
+ offset(src, bld, first_component / size_ratio),
+ type_sz(src.type) * bld.dispatch_width() *
+ DIV_ROUND_UP(components + (first_component % size_ratio),
+ size_ratio)));
+
+ brw_reg_type shuffle_type =
+ brw_reg_type_from_bit_size(8 * type_sz(dst.type),
+ BRW_REGISTER_TYPE_D);
+ for (unsigned i = 0; i < components; i++) {
+ fs_reg shuffle_component_i =
+ subscript(offset(src, bld, (first_component + i) / size_ratio),
+ shuffle_type, (first_component + i) % size_ratio);
+ bld.MOV(retype(offset(dst, bld, i), shuffle_type),
+ shuffle_component_i);
+ }
+ }
+}
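
The 16-bit-into-32-bit case from the first diagram above can be checked
with a standalone sketch that models one SIMD channel's data as a plain
array. This is illustrative C++ only, not the fs_reg/fs_builder IR:

#include <cstdint>
#include <cstdio>

int main()
{
   const unsigned components = 3;                /* a .xyz 16-bit vector */
   uint16_t src[3] = { 0x1111, 0x2222, 0x3333 };
   uint32_t dst[2] = { 0, 0 };
   const unsigned size_ratio = sizeof(dst[0]) / sizeof(src[0]);

   for (unsigned i = 0; i < components; i++) {
      /* Mirrors subscript(offset(dst, bld, i / size_ratio), W,
       * i % size_ratio) in shuffle_src_to_dst(). */
      uint16_t *halves = (uint16_t *)&dst[i / size_ratio];
      halves[i % size_ratio] = src[i];
   }

   /* On little-endian: dst[0] = 0x22221111, dst[1] = 0x00003333 */
   printf("dst[0] = 0x%08x, dst[1] = 0x%08x\n", dst[0], dst[1]);
   return 0;
}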
+
+void
+shuffle_from_32bit_read(const fs_builder &bld,
+ const fs_reg &dst,
+ const fs_reg &src,
+ uint32_t first_component,
+ uint32_t components)
{
- assert(type_sz(src.type) == 2);
- assert(type_sz(dst.type) == 4);
+ assert(type_sz(src.type) == 4);
- /* A temporary is used to shuffle the 16-bit data of each component in the
- * 32-bit data vector. We can't write directly to dst because it can be the
- * same register as src and in that case the first MOV in the loop below
- * would overwrite the data read in the second MOV.
+ /* This function takes components in units of the destination type, while
+ * shuffle_src_to_dst() takes components in units of the smaller type.
*/
- fs_reg tmp = bld.vgrf(dst.type);
-
- for (unsigned i = 0; i < components; i++) {
- const fs_reg component_i = offset(src, bld, i);
- bld.MOV(subscript(tmp, src.type, i % 2), component_i);
- if (i % 2) {
- bld.MOV(offset(dst, bld, i / 2), tmp);
- }
+ if (type_sz(dst.type) > 4) {
+ assert(type_sz(dst.type) == 8);
+ first_component *= 2;
+ components *= 2;
}
- if (components % 2) {
- bld.MOV(offset(dst, bld, components / 2), tmp);
+
+ shuffle_src_to_dst(bld, dst, src, first_component, components);
+}
+
+fs_reg
+shuffle_for_32bit_write(const fs_builder &bld,
+ const fs_reg &src,
+ uint32_t first_component,
+ uint32_t components)
+{
+ fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
+ DIV_ROUND_UP(components * type_sz(src.type), 4));
+ /* This function takes components in units of the source type, while
+ * shuffle_src_to_dst() takes components in units of the smaller type.
+ */
+ if (type_sz(src.type) > 4) {
+ assert(type_sz(src.type) == 8);
+ first_component *= 2;
+ components *= 2;
}
+
+ shuffle_src_to_dst(bld, dst, src, first_component, components);
+
+ return dst;
}
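
The unit bookkeeping shared by these two wrappers reduces to two small
formulas. A minimal standalone sketch, assuming the DIV_ROUND_UP
semantics above; dwords_needed and in_smaller_units are illustrative
names, not driver functions:

#include <cstdio>

static unsigned dwords_needed(unsigned components, unsigned type_sz)
{
   return (components * type_sz + 3) / 4;   /* DIV_ROUND_UP(..., 4) */
}

static unsigned in_smaller_units(unsigned components, unsigned type_sz)
{
   /* 64-bit callers double their counts so that shuffle_src_to_dst()
    * sees components in units of the smaller (32-bit) type. */
   return type_sz > 4 ? components * 2 : components;
}

int main(void)
{
   printf("dvec2 write: %u dwords, %u shuffle components\n",
          dwords_needed(2, 8), in_smaller_units(2, 8));
   printf("3 x 16-bit write: %u dwords, %u shuffle components\n",
          dwords_needed(3, 2), in_smaller_units(3, 2));
   return 0;
}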
fs_reg