nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_vertex_id:
- unreachable("should be lowered by lower_vertex_id().");
+ case nir_intrinsic_load_base_vertex:
+ unreachable("should be lowered by nir_lower_system_values().");
case nir_intrinsic_load_vertex_id_zero_base:
- case nir_intrinsic_load_base_vertex:
+ case nir_intrinsic_load_is_indexed_draw:
+ case nir_intrinsic_load_first_vertex:
case nir_intrinsic_load_instance_id:
case nir_intrinsic_load_base_instance:
case nir_intrinsic_load_draw_id:
{
const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
- reg = abld.vgrf(BRW_REGISTER_TYPE_W);
+ reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
const fs_builder allbld8 = abld.group(8, 0).exec_all();
allbld8.MOV(reg, brw_imm_v(0x76543210));
case 32:
return BRW_REGISTER_TYPE_D;
case 64:
- return BRW_REGISTER_TYPE_DF;
+ return BRW_REGISTER_TYPE_Q;
default:
unreachable("Invalid bit size");
}
case 32:
return BRW_REGISTER_TYPE_UD;
case 64:
- return BRW_REGISTER_TYPE_DF;
+ return BRW_REGISTER_TYPE_UQ;
default:
unreachable("Invalid bit size");
}
inst->src[0].negate = true;
}
+static brw_rnd_mode
+brw_rnd_mode_from_nir_op(const nir_op op)
+{
+ switch (op) {
+ case nir_op_f2f16_rtz:
+ return BRW_RND_MODE_RTZ;
+ case nir_op_f2f16_rtne:
+ return BRW_RND_MODE_RTNE;
+ default:
+ unreachable("Operation doesn't support rounding mode");
+ }
+}
+
void
fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
{
inst->saturate = instr->dest.saturate;
break;
+ case nir_op_f2f16_rtne:
+ case nir_op_f2f16_rtz:
+ bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
+ brw_imm_d(brw_rnd_mode_from_nir_op(instr->op)));
+ /* fallthrough */
+
+ /* In theory it would be better to use BRW_OPCODE_F32TO16. Depending on
+ * the HW gen it is either a dedicated hw opcode or just a MOV, and
+ * brw_F32TO16 (in brw_eu_emit) would do the work of choosing.
+ *
+ * But using that opcode would require supporting it in several
+ * optimizations and lowerings. Since HF support is currently gen8+
+ * only, it is better to use the MOV directly, and switch to
+ * BRW_OPCODE_F32TO16 if/when we add HF support for gen7.
+ */
+
+ case nir_op_f2f16_undef:
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
case nir_op_f2f64:
+ case nir_op_f2i64:
+ case nir_op_f2u64:
case nir_op_i2f64:
+ case nir_op_i2i64:
case nir_op_u2f64:
+ case nir_op_u2u64:
/* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
*
* "When source or destination is 64b (...), regioning in Align1
* the same qword.
* (...)"
*
- * This means that 32-bit to 64-bit conversions need to have the 32-bit
- * data elements aligned to 64-bit. This restriction does not apply to
- * BDW and later.
+ * This means that conversions from bit-sizes smaller than 64-bit to
+ * 64-bit need to have the source data elements aligned to 64-bit.
+ * This restriction does not apply to BDW and later.
*/
if (nir_dest_bit_size(instr->dest.dest) == 64 &&
- nir_src_bit_size(instr->src[0].src) == 32 &&
+ nir_src_bit_size(instr->src[0].src) < 64 &&
(devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
fs_reg tmp = bld.vgrf(result.type, 1);
tmp = subscript(tmp, op[0].type, 0);
case nir_op_f2f32:
case nir_op_f2i32:
case nir_op_f2u32:
- case nir_op_f2i64:
- case nir_op_f2u64:
+ case nir_op_f2i16:
+ case nir_op_f2u16:
case nir_op_i2i32:
- case nir_op_i2i64:
case nir_op_u2u32:
- case nir_op_u2u64:
+ case nir_op_i2i16:
+ case nir_op_u2u16:
+ case nir_op_i2f16:
+ case nir_op_u2f16:
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
break;
}
- case nir_op_isign:
+ case nir_op_isign: {
/* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
* -> non-negative val generates 0x00000000.
* Predicated OR sets 1 if val is positive.
*/
- assert(nir_dest_bit_size(instr->dest.dest) < 64);
- bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G);
- bld.ASR(result, op[0], brw_imm_d(31));
- inst = bld.OR(result, result, brw_imm_d(1));
+ uint32_t bit_size = nir_dest_bit_size(instr->dest.dest);
+ assert(bit_size == 32 || bit_size == 16);
+
+ fs_reg zero = bit_size == 32 ? brw_imm_d(0) : brw_imm_w(0);
+ fs_reg one = bit_size == 32 ? brw_imm_d(1) : brw_imm_w(1);
+ fs_reg shift = bit_size == 32 ? brw_imm_d(31) : brw_imm_w(15);
+
+ bld.CMP(bld.null_reg_d(), op[0], zero, BRW_CONDITIONAL_G);
+ bld.ASR(result, op[0], shift);
+ inst = bld.OR(result, result, one);
inst->predicate = BRW_PREDICATE_NORMAL;
break;
+ }
case nir_op_frcp:
inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
case nir_op_feq:
case nir_op_fne: {
fs_reg dest = result;
- if (nir_src_bit_size(instr->src[0].src) > 32) {
- dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
- }
+
+ const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ if (bit_size != 32)
+ dest = bld.vgrf(op[0].type, 1);
+
brw_conditional_mod cond;
switch (instr->op) {
case nir_op_flt:
default:
unreachable("bad opcode");
}
+
bld.CMP(dest, op[0], op[1], cond);
- if (nir_src_bit_size(instr->src[0].src) > 32) {
+
+ if (bit_size > 32) {
bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+ } else if (bit_size < 32) {
+ /* When we convert the result to 32 bits we need to do it as a signed
+ * conversion so the sign is extended (a 32-bit "true" is all ones).
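+ * e.g. with a 16-bit source a true compare result is 0xFFFF; a signed
+ * W -> D move sign-extends it to 0xFFFFFFFF, while an unsigned move
+ * would produce 0x0000FFFF.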
+ */
+ const brw_reg_type src_type =
+ brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
+
+ bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
}
break;
}
case nir_op_ieq:
case nir_op_ine: {
fs_reg dest = result;
- if (nir_src_bit_size(instr->src[0].src) > 32) {
- dest = bld.vgrf(BRW_REGISTER_TYPE_UQ, 1);
- }
+
+ const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ if (bit_size != 32)
+ dest = bld.vgrf(op[0].type, 1);
brw_conditional_mod cond;
switch (instr->op) {
unreachable("bad opcode");
}
bld.CMP(dest, op[0], op[1], cond);
- if (nir_src_bit_size(instr->src[0].src) > 32) {
+
+ if (bit_size > 32) {
bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+ } else if (bit_size < 32) {
+ /* When we convert the result to 32 bits we need to do it as a signed
+ * conversion so the sign is extended (a 32-bit "true" is all ones).
+ */
+ const brw_reg_type src_type =
+ brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
+
+ bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
}
break;
}
break;
case nir_op_i2b:
- case nir_op_f2b:
- if (nir_src_bit_size(instr->src[0].src) == 64) {
+ case nir_op_f2b: {
+ uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ if (bit_size == 64) {
/* two-argument instructions can't take 64-bit immediates */
fs_reg zero;
fs_reg tmp;
if (instr->op == nir_op_f2b) {
zero = vgrf(glsl_type::double_type);
tmp = vgrf(glsl_type::double_type);
+ bld.MOV(zero, setup_imm_df(bld, 0.0));
} else {
zero = vgrf(glsl_type::int64_t_type);
tmp = vgrf(glsl_type::int64_t_type);
+ bld.MOV(zero, brw_imm_q(0));
}
- bld.MOV(zero, setup_imm_df(bld, 0.0));
/* A SIMD16 execution needs to be split in two instructions, so use
* a vgrf instead of the flag register as dst so instruction splitting
* works
bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
} else {
- if (instr->op == nir_op_f2b) {
- bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
+ fs_reg zero;
+ if (bit_size == 32) {
+ zero = instr->op == nir_op_f2b ? brw_imm_f(0.0f) : brw_imm_d(0);
} else {
- bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
+ assert(bit_size == 16);
+ zero = instr->op == nir_op_f2b ?
+ retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
}
+ bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
}
break;
+ }
case nir_op_ftrunc:
inst = bld.RNDZ(result, op[0]);
break;
case nir_op_pack_64_2x32_split:
+ case nir_op_pack_32_2x16_split:
bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
break;
break;
}
+ case nir_op_unpack_32_2x16_split_x:
+ case nir_op_unpack_32_2x16_split_y: {
+ if (instr->op == nir_op_unpack_32_2x16_split_x)
+ bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
+ else
+ bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
+ break;
+ }
+
case nir_op_fpow:
inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
inst->saturate = instr->dest.saturate;
case nir_op_extract_u8:
case nir_op_extract_i8: {
- const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
assert(byte != NULL);
- bld.MOV(result, subscript(op[0], type, byte->u32[0]));
+
+ /* The PRMs say:
+ *
+ * BDW+
+ * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
+ * Use two instructions and a word or DWord intermediate integer type.
+ */
+ if (nir_dest_bit_size(instr->dest.dest) == 64) {
+ const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i8);
+
+ if (instr->op == nir_op_extract_i8) {
+ /* If we need to sign extend, extract to a word first */
+ fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
+ bld.MOV(w_temp, subscript(op[0], type, byte->u32[0]));
+ bld.MOV(result, w_temp);
+ } else {
+ /* Otherwise use an AND with 0xff and a word type */
+ bld.AND(result, subscript(op[0], type, byte->u32[0] / 2), brw_imm_uw(0xff));
+ }
+ } else {
+ const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
+ bld.MOV(result, subscript(op[0], type, byte->u32[0]));
+ }
break;
}
fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
switch (instr->def.bit_size) {
+ case 16:
+ for (unsigned i = 0; i < instr->def.num_components; i++)
+ bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value.i16[i]));
+ break;
+
case 32:
for (unsigned i = 0; i < instr->def.num_components; i++)
bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
break;
case 64:
- for (unsigned i = 0; i < instr->def.num_components; i++)
- bld.MOV(offset(reg, bld, i),
- setup_imm_df(bld, instr->value.f64[i]));
+ assert(devinfo->gen >= 7);
+ if (devinfo->gen == 7) {
+ /* We don't get 64-bit integer types until gen8 */
+ for (unsigned i = 0; i < instr->def.num_components; i++) {
+ bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
+ setup_imm_df(bld, instr->value.f64[i]));
+ }
+ } else {
+ for (unsigned i = 0; i < instr->def.num_components; i++)
+ bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value.i64[i]));
+ }
break;
default:
get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
{
switch (op) {
- case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_var_atomic_add:
return BRW_AOP_ADD;
- case nir_intrinsic_image_atomic_min:
+ case nir_intrinsic_image_var_atomic_min:
return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
BRW_AOP_IMIN : BRW_AOP_UMIN);
- case nir_intrinsic_image_atomic_max:
+ case nir_intrinsic_image_var_atomic_max:
return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
BRW_AOP_IMAX : BRW_AOP_UMAX);
- case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_var_atomic_and:
return BRW_AOP_AND;
- case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_var_atomic_or:
return BRW_AOP_OR;
- case nir_intrinsic_image_atomic_xor:
+ case nir_intrinsic_image_var_atomic_xor:
return BRW_AOP_XOR;
- case nir_intrinsic_image_atomic_exchange:
+ case nir_intrinsic_image_var_atomic_exchange:
return BRW_AOP_MOV;
- case nir_intrinsic_image_atomic_comp_swap:
+ case nir_intrinsic_image_var_atomic_comp_swap:
return BRW_AOP_CMPWR;
default:
unreachable("Not reachable.");
* by 32 (shifting by 5), and add the two together. This is
* the final indirect byte offset.
*/
- fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
+ fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
}
if (type_sz(dst.type) == 8) {
- shuffle_32bit_load_result_to_64bit_data(
- bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
-
- for (unsigned c = 0; c < num_components; c++)
- bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
+ shuffle_from_32bit_read(bld,
+ offset(dst, bld, iter * 2),
+ retype(tmp_dst, BRW_REGISTER_TYPE_D),
+ 0,
+ num_components);
}
if (num_iterations > 1) {
const fs_reg offset_reg,
unsigned num_components)
{
- if (type_sz(dest.type) == 4) {
+ if (type_sz(dest.type) <= 2) {
+ assert(dest.stride == 1);
+ bool is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE;
+
+ if (is_const_offset) {
+ uint32_t start = offset_reg.ud & ~3;
+ uint32_t end = offset_reg.ud + num_components * type_sz(dest.type);
+ end = ALIGN(end, 4);
+ assert(end - start <= 16);
+
+ /* At this point we have one or more 16-bit components at a constant
+ * offset; rounding the offset down to 4 bytes lets us fetch them with
+ * untyped_read, since that message requires 32-bit aligned offsets.
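+ * For instance, reading three 16-bit components at byte offset 2 gives
+ * start = 0 and end = ALIGN(2 + 6, 4) = 8, i.e. two 32-bit slots, with
+ * first_component = (2 & 3) / 2 = 1, so the shuffle below skips the
+ * leading 16-bit element.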
+ */
+ unsigned first_component = (offset_reg.ud & 3) / type_sz(dest.type);
+ unsigned num_components_32bit = (end - start) / 4;
+
+ fs_reg read_result =
+ emit_untyped_read(bld, surf_index, brw_imm_ud(start),
+ 1 /* dims */,
+ num_components_32bit,
+ BRW_PREDICATE_NONE);
+ shuffle_from_32bit_read(bld, dest, read_result, first_component,
+ num_components);
+ } else {
+ fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ for (unsigned i = 0; i < num_components; i++) {
+ if (i == 0) {
+ bld.MOV(read_offset, offset_reg);
+ } else {
+ bld.ADD(read_offset, offset_reg,
+ brw_imm_ud(i * type_sz(dest.type)));
+ }
+ /* Non-constant offsets are not guaranteed to be 32-bit aligned, so
+ * each component is read with its own byte_scattered_read message.
+ */
+ fs_reg read_result =
+ emit_byte_scattered_read(bld, surf_index, read_offset,
+ 1 /* dims */, 1,
+ type_sz(dest.type) * 8 /* bit_size */,
+ BRW_PREDICATE_NONE);
+ bld.MOV(offset(dest, bld, i),
+ subscript(read_result, dest.type, 0));
+ }
+ }
+ } else if (type_sz(dest.type) == 4) {
fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
1 /* dims */,
num_components,
BRW_PREDICATE_NONE);
/* Shuffle the 32-bit load result into valid 64-bit data */
- const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
- shuffle_32bit_load_result_to_64bit_data(
- bld, packed_result, read_result, iter_components);
-
- /* Move each component to its destination */
- read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
- for (int c = 0; c < iter_components; c++) {
- bld.MOV(offset(dest, bld, it * 2 + c),
- offset(packed_result, bld, c));
- }
+ shuffle_from_32bit_read(bld, offset(dest, bld, it * 2),
+ read_result, 0, iter_components);
bld.ADD(read_offset, read_offset, brw_imm_ud(16));
}
switch (instr->intrinsic) {
case nir_intrinsic_load_vertex_id:
- unreachable("should be lowered by lower_vertex_id()");
+ case nir_intrinsic_load_base_vertex:
+ unreachable("should be lowered by nir_lower_system_values()");
case nir_intrinsic_load_vertex_id_zero_base:
- case nir_intrinsic_load_base_vertex:
case nir_intrinsic_load_instance_id:
case nir_intrinsic_load_base_instance:
case nir_intrinsic_load_draw_id: {
fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
unsigned first_component = nir_intrinsic_component(instr);
unsigned num_components = instr->num_components;
- enum brw_reg_type type = dest.type;
nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
assert(const_offset && "Indirect input loads not allowed");
src = offset(src, bld, const_offset->u32[0]);
+ if (type_sz(dest.type) == 8)
+ first_component /= 2;
+
for (unsigned j = 0; j < num_components; j++) {
bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
}
- if (type == BRW_REGISTER_TYPE_DF) {
- /* Once the double vector is read, set again its original register
- * type to continue with normal execution.
- */
- src = retype(src, type);
- dest = retype(dest, type);
- }
-
- if (type_sz(src.type) == 8) {
+ if (type_sz(dest.type) == 8) {
shuffle_32bit_load_result_to_64bit_data(bld,
dest,
retype(dest, BRW_REGISTER_TYPE_F),
break;
}
+ case nir_intrinsic_load_first_vertex:
+ case nir_intrinsic_load_is_indexed_draw:
+ unreachable("lowered by brw_nir_lower_vs_inputs");
+
default:
nir_emit_intrinsic(bld, instr);
break;
case nir_intrinsic_load_input: {
/* load_input is only used for flat inputs */
unsigned base = nir_intrinsic_base(instr);
- unsigned component = nir_intrinsic_component(instr);
+ unsigned comp = nir_intrinsic_component(instr);
unsigned num_components = instr->num_components;
enum brw_reg_type type = dest.type;
/* Special case fields in the VUE header */
if (base == VARYING_SLOT_LAYER)
- component = 1;
+ comp = 1;
else if (base == VARYING_SLOT_VIEWPORT)
- component = 2;
+ comp = 2;
if (nir_dest_bit_size(instr->dest) == 64) {
/* const_index is in 32-bit type size units that could not be aligned
}
for (unsigned int i = 0; i < num_components; i++) {
- struct brw_reg interp = interp_reg(base, component + i);
- interp = suboffset(interp, 3);
- bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
- retype(fs_reg(interp), type));
+ bld.MOV(offset(retype(dest, type), bld, i),
+ retype(component(interp_reg(base, comp + i), 3), type));
}
if (nir_dest_bit_size(instr->dest) == 64) {
for (unsigned int i = 0; i < instr->num_components; i++) {
fs_reg interp =
- fs_reg(interp_reg(nir_intrinsic_base(instr),
- nir_intrinsic_component(instr) + i));
+ component(interp_reg(nir_intrinsic_base(instr),
+ nir_intrinsic_component(instr) + i), 0);
interp.type = BRW_REGISTER_TYPE_F;
dest.type = BRW_REGISTER_TYPE_F;
}
}
+static fs_reg
+brw_nir_reduction_op_identity(const fs_builder &bld,
+ nir_op op, brw_reg_type type)
+{
+ nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
+ switch (type_sz(type)) {
+ case 2:
+ assert(type != BRW_REGISTER_TYPE_HF);
+ return retype(brw_imm_uw(value.u16[0]), type);
+ case 4:
+ return retype(brw_imm_ud(value.u32[0]), type);
+ case 8:
+ if (type == BRW_REGISTER_TYPE_DF)
+ return setup_imm_df(bld, value.f64[0]);
+ else
+ return retype(brw_imm_u64(value.u64[0]), type);
+ default:
+ unreachable("Invalid type size");
+ }
+}
+
+static opcode
+brw_op_for_nir_reduction_op(nir_op op)
+{
+ switch (op) {
+ case nir_op_iadd: return BRW_OPCODE_ADD;
+ case nir_op_fadd: return BRW_OPCODE_ADD;
+ case nir_op_imul: return BRW_OPCODE_MUL;
+ case nir_op_fmul: return BRW_OPCODE_MUL;
+ case nir_op_imin: return BRW_OPCODE_SEL;
+ case nir_op_umin: return BRW_OPCODE_SEL;
+ case nir_op_fmin: return BRW_OPCODE_SEL;
+ case nir_op_imax: return BRW_OPCODE_SEL;
+ case nir_op_umax: return BRW_OPCODE_SEL;
+ case nir_op_fmax: return BRW_OPCODE_SEL;
+ case nir_op_iand: return BRW_OPCODE_AND;
+ case nir_op_ior: return BRW_OPCODE_OR;
+ case nir_op_ixor: return BRW_OPCODE_XOR;
+ default:
+ unreachable("Invalid reduction operation");
+ }
+}
+
+static brw_conditional_mod
+brw_cond_mod_for_nir_reduction_op(nir_op op)
+{
+ switch (op) {
+ case nir_op_iadd: return BRW_CONDITIONAL_NONE;
+ case nir_op_fadd: return BRW_CONDITIONAL_NONE;
+ case nir_op_imul: return BRW_CONDITIONAL_NONE;
+ case nir_op_fmul: return BRW_CONDITIONAL_NONE;
+ case nir_op_imin: return BRW_CONDITIONAL_L;
+ case nir_op_umin: return BRW_CONDITIONAL_L;
+ case nir_op_fmin: return BRW_CONDITIONAL_L;
+ case nir_op_imax: return BRW_CONDITIONAL_GE;
+ case nir_op_umax: return BRW_CONDITIONAL_GE;
+ case nir_op_fmax: return BRW_CONDITIONAL_GE;
+ case nir_op_iand: return BRW_CONDITIONAL_NONE;
+ case nir_op_ior: return BRW_CONDITIONAL_NONE;
+ case nir_op_ixor: return BRW_CONDITIONAL_NONE;
+ default:
+ unreachable("Invalid reduction operation");
+ }
+}
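+
+/* Note that min/max reductions map to SEL above: with a .l or .ge
+ * conditional modifier SEL keeps whichever source satisfies the
+ * comparison, which is how MIN/MAX are expressed on this hardware.
+ */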
+
void
fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
{
dest = get_nir_dest(instr->dest);
switch (instr->intrinsic) {
- case nir_intrinsic_atomic_counter_inc:
- case nir_intrinsic_atomic_counter_dec:
- case nir_intrinsic_atomic_counter_read:
- case nir_intrinsic_atomic_counter_add:
- case nir_intrinsic_atomic_counter_min:
- case nir_intrinsic_atomic_counter_max:
- case nir_intrinsic_atomic_counter_and:
- case nir_intrinsic_atomic_counter_or:
- case nir_intrinsic_atomic_counter_xor:
- case nir_intrinsic_atomic_counter_exchange:
- case nir_intrinsic_atomic_counter_comp_swap: {
- if (stage == MESA_SHADER_FRAGMENT &&
- instr->intrinsic != nir_intrinsic_atomic_counter_read)
- brw_wm_prog_data(prog_data)->has_side_effects = true;
-
- /* Get some metadata from the image intrinsic. */
- const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
-
- /* Get the arguments of the atomic intrinsic. */
- const fs_reg offset = get_nir_src(instr->src[0]);
- const unsigned surface = (stage_prog_data->binding_table.abo_start +
- instr->const_index[0]);
- const fs_reg src0 = (info->num_srcs >= 2
- ? get_nir_src(instr->src[1]) : fs_reg());
- const fs_reg src1 = (info->num_srcs >= 3
- ? get_nir_src(instr->src[2]) : fs_reg());
- fs_reg tmp;
-
- assert(info->num_srcs <= 3);
-
- /* Emit a surface read or atomic op. */
- if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
- tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1);
- } else {
- tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, src0,
- src1, 1, 1,
- get_atomic_counter_op(instr->intrinsic));
- }
-
- /* Assign the result. */
- bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
-
- /* Mark the surface as used. */
- brw_mark_surface_used(stage_prog_data, surface);
- break;
- }
-
- case nir_intrinsic_image_load:
- case nir_intrinsic_image_store:
- case nir_intrinsic_image_atomic_add:
- case nir_intrinsic_image_atomic_min:
- case nir_intrinsic_image_atomic_max:
- case nir_intrinsic_image_atomic_and:
- case nir_intrinsic_image_atomic_or:
- case nir_intrinsic_image_atomic_xor:
- case nir_intrinsic_image_atomic_exchange:
- case nir_intrinsic_image_atomic_comp_swap: {
+ case nir_intrinsic_image_var_load:
+ case nir_intrinsic_image_var_store:
+ case nir_intrinsic_image_var_atomic_add:
+ case nir_intrinsic_image_var_atomic_min:
+ case nir_intrinsic_image_var_atomic_max:
+ case nir_intrinsic_image_var_atomic_and:
+ case nir_intrinsic_image_var_atomic_or:
+ case nir_intrinsic_image_var_atomic_xor:
+ case nir_intrinsic_image_var_atomic_exchange:
+ case nir_intrinsic_image_var_atomic_comp_swap: {
using namespace image_access;
if (stage == MESA_SHADER_FRAGMENT &&
- instr->intrinsic != nir_intrinsic_image_load)
+ instr->intrinsic != nir_intrinsic_image_var_load)
brw_wm_prog_data(prog_data)->has_side_effects = true;
/* Get the referenced image variable and type. */
const unsigned arr_dims = type->sampler_array ? 1 : 0;
const unsigned surf_dims = type->coordinate_components() - arr_dims;
const unsigned format = var->data.image.format;
+ const unsigned dest_components = nir_intrinsic_dest_components(instr);
/* Get the arguments of the image intrinsic. */
const fs_reg image = get_nir_image_deref(instr->variables[0]);
fs_reg tmp;
/* Emit an image load, store or atomic op. */
- if (instr->intrinsic == nir_intrinsic_image_load)
+ if (instr->intrinsic == nir_intrinsic_image_var_load)
tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
- else if (instr->intrinsic == nir_intrinsic_image_store)
+ else if (instr->intrinsic == nir_intrinsic_image_var_store)
emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
var->data.image.write_only ? GL_NONE : format);
else
tmp = emit_image_atomic(bld, image, addr, src0, src1,
- surf_dims, arr_dims, info->dest_components,
+ surf_dims, arr_dims, dest_components,
get_image_atomic_op(instr->intrinsic, type));
/* Assign the result. */
- for (unsigned c = 0; c < info->dest_components; ++c)
+ for (unsigned c = 0; c < dest_components; ++c) {
bld.MOV(offset(retype(dest, base_type), bld, c),
- offset(tmp, bld, c));
+ offset(tmp, bld, c));
+ }
break;
}
break;
}
- case nir_intrinsic_image_size: {
+ case nir_intrinsic_image_var_size: {
/* Get the referenced image variable and type. */
const nir_variable *var = instr->variables[0]->var;
const glsl_type *type = var->type->without_array();
break;
}
- case nir_intrinsic_image_samples:
+ case nir_intrinsic_image_var_samples:
/* The driver does not support multi-sampled images. */
bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
break;
case nir_intrinsic_load_uniform: {
- /* Offsets are in bytes but they should always be multiples of 4 */
- assert(instr->const_index[0] % 4 == 0);
+ /* Offsets are in bytes, but they should always be aligned to the
+ * type size.
+ */
+ assert(instr->const_index[0] % 4 == 0 ||
+ instr->const_index[0] % type_sz(dest.type) == 0);
fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
if (const_offset) {
- /* Offsets are in bytes but they should always be multiples of 4 */
- assert(const_offset->u32[0] % 4 == 0);
- src.offset = const_offset->u32[0];
+ assert(const_offset->u32[0] % type_sz(dest.type) == 0);
+ /* For 16-bit types we add const_index[0] modulo 4 so that we can
+ * address elements that are not 32-bit aligned.
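+ * e.g. with const_index[0] == 6 and a zero constant source offset, src
+ * starts at UNIFORM slot 1 (6 / 4) and 6 % 4 = 2 bytes are added here,
+ * addressing the second 16-bit element of that slot.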
+ */
+ src.offset = const_offset->u32[0] + instr->const_index[0] % 4;
for (unsigned j = 0; j < instr->num_components; j++) {
bld.MOV(offset(dest, bld, j), offset(src, bld, j));
if (const_offset) {
offset_reg = brw_imm_ud(const_offset->u32[0]);
} else {
- offset_reg = get_nir_src(instr->src[1]);
+ offset_reg = retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD);
}
/* Read the vector */
* Also, we have to shuffle 64-bit data to be in the appropriate layout
* expected by our 32-bit write messages.
*/
- unsigned type_size = 4;
- if (nir_src_bit_size(instr->src[0]) == 64) {
- type_size = 8;
- val_reg = shuffle_64bit_data_for_32bit_write(bld,
- val_reg, instr->num_components);
- }
-
- unsigned type_slots = type_size / 4;
+ unsigned bit_size = nir_src_bit_size(instr->src[0]);
+ unsigned type_size = bit_size / 8;
/* Combine groups of consecutive enabled channels in one write
* message. We use ffs to find the first enabled channel and then ffs on
- * the bit-inverse, down-shifted writemask to determine the length of
- * the block of enabled bits.
+ * the bit-inverse, down-shifted writemask to determine the number of
+ * consecutive enabled components (num_components) in the block.
*/
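+ /* For example, with 32-bit data a writemask of 0b1101 (.xzw) takes two
+ * passes: first_component = 0 with num_components = 1 (.x), then, after
+ * clearing the written bits, first_component = 2 with num_components = 2
+ * (.zw). The 16-bit and 64-bit paths below may shrink num_components
+ * further before emitting.
+ */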
while (writemask) {
unsigned first_component = ffs(writemask) - 1;
- unsigned length = ffs(~(writemask >> first_component)) - 1;
+ unsigned num_components = ffs(~(writemask >> first_component)) - 1;
+ fs_reg write_src = offset(val_reg, bld, first_component);
- /* We can't write more than 2 64-bit components at once. Limit the
- * length of the write to what we can do and let the next iteration
- * handle the rest
- */
- if (type_size > 4)
- length = MIN2(2, length);
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
+
+ if (type_size > 4) {
+ /* We can't write more than 2 64-bit components at once. Limit
+ * the num_components of the write to what we can do and let the next
+ * iteration handle the rest.
+ */
+ num_components = MIN2(2, num_components);
+ write_src = shuffle_64bit_data_for_32bit_write(bld, write_src,
+ num_components);
+ } else if (type_size < 4) {
+ assert(type_size == 2);
+ /* For 16-bit types we pack two consecutive values into a 32-bit
+ * word and use an untyped write message. For single values or
+ * offsets that are not 32-bit aligned we need byte-scattered
+ * writes, because untyped writes only work on 32-bit components
+ * with 32-bit alignment, and byte_scattered_write messages only
+ * support one 16-bit component at a time. Since
+ * VK_KHR_relaxed_block_layout may be enabled, we cannot guarantee
+ * that non-constant offsets are 32-bit aligned for 16-bit types;
+ * for example, an array of 16-bit vec3 has an array element
+ * stride of 6 bytes.
+ *
+ * For 32-bit aligned constant offsets, a 3-component vector is
+ * written as one 32-bit untyped-write message (the first two
+ * components) plus one byte-scattered write message (the last
+ * component).
+ */
+
+ if (!const_offset || ((const_offset->u32[0] +
+ type_size * first_component) % 4)) {
+ /* e.g. with a .yz writemask we also need to emit two
+ * byte-scattered write messages, because the y component is
+ * not 32-bit aligned.
+ */
+ num_components = 1;
+ } else if (num_components > 2 && (num_components % 2)) {
+ /* If there is an odd number of consecutive components we leave
+ * the unpaired component for a later iteration, which emits it
+ * with a single byte_scattered_write.
+ */
+ num_components--;
+ }
+ /* For num_components == 1 we shuffle the component as well, because
+ * 16-bit byte-scattered writes need their value dword-aligned;
+ * shuffling a single component is equivalent to striding it.
+ */
+ write_src = shuffle_for_32bit_write(bld, write_src, 0,
+ num_components);
+ }
fs_reg offset_reg;
- nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
+
if (const_offset) {
offset_reg = brw_imm_ud(const_offset->u32[0] +
type_size * first_component);
brw_imm_ud(type_size * first_component));
}
-
- emit_untyped_write(bld, surf_index, offset_reg,
- offset(val_reg, bld, first_component * type_slots),
- 1 /* dims */, length * type_slots,
- BRW_PREDICATE_NONE);
+ if (type_size < 4 && num_components == 1) {
+ assert(type_size == 2);
+ /* Untyped surface messages have a fixed 32-bit size, so we have to
+ * rely on byte-scattered writes to store 16-bit elements. The
+ * byte_scattered_write message requires each written 16-bit value
+ * to be 32-bit aligned (stride=2).
+ */
+ emit_byte_scattered_write(bld, surf_index, offset_reg,
+ write_src,
+ 1 /* dims */, 1,
+ bit_size,
+ BRW_PREDICATE_NONE);
+ } else {
+ assert(num_components * type_size <= 16);
+ assert((num_components * type_size) % 4 == 0);
+ assert(offset_reg.file != BRW_IMMEDIATE_VALUE ||
+ offset_reg.ud % 4 == 0);
+ unsigned num_slots = (num_components * type_size) / 4;
+
+ emit_untyped_write(bld, surf_index, offset_reg,
+ write_src,
+ 1 /* dims */, num_slots,
+ BRW_PREDICATE_NONE);
+ }
/* Clear the bits in the writemask that we just wrote, then try
* again to see if more channels are left.
*/
- writemask &= (15 << (first_component + length));
+ writemask &= (15 << (first_component + num_components));
}
break;
}
ubld.MOV(src_payload, brw_imm_d(0));
const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
- fs_inst *inst = ubld.emit(FS_OPCODE_GET_BUFFER_SIZE, ret_payload,
+ fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
src_payload, brw_imm_ud(index));
inst->header_size = 0;
inst->mlen = 1;
inst->size_written = 4 * REG_SIZE;
- bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0));
+ /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
+ *
+ * "Out-of-bounds checking is always performed at a DWord granularity. If
+ * any part of the DWord is out-of-bounds then the whole DWord is
+ * considered out-of-bounds."
+ *
+ * This implies that types smaller than 4 bytes need to be padded if
+ * they don't fill the last dword of the buffer. But since we need the
+ * original size to know the number of elements of an unsized array, we
+ * have to undo that padding. The padding applied to the buffer is
+ * stored in the two low bits of the surface size, so we recover the
+ * original buffer_size by reversing the surface_size calculation:
+ *
+ * surface_size = isl_align(buffer_size, 4) +
+ * (isl_align(buffer_size, 4) - buffer_size)
+ *
+ * buffer_size = (surface_size & ~3) - (surface_size & 3)
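+ *
+ * For example, a buffer_size of 6 is stored as surface_size
+ * 8 + (8 - 6) = 10, and we recover (10 & ~3) - (10 & 3) = 8 - 2 = 6.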
+ */
+
+ fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+ fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+ fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+
+ ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
+ ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
+ ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
+
+ bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
+
brw_mark_surface_used(prog_data, index);
break;
}
bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
break;
}
- case nir_intrinsic_vote_eq: {
+ case nir_intrinsic_vote_feq:
+ case nir_intrinsic_vote_ieq: {
fs_reg value = get_nir_src(instr->src[0]);
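+ /* For vote_feq, retype the source to a float type of matching bit size,
+ * presumably so the comparison below uses floating-point equality
+ * semantics (e.g. -0.0 compares equal to 0.0).
+ */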
+ if (instr->intrinsic == nir_intrinsic_vote_feq) {
+ const unsigned bit_size = nir_src_bit_size(instr->src[0]);
+ value.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
+ }
+
fs_reg uniformized = bld.emit_uniformize(value);
const fs_builder ubld = bld.exec_all().group(1, 0);
break;
}
+ case nir_intrinsic_shuffle: {
+ const fs_reg value = get_nir_src(instr->src[0]);
+ const fs_reg index = get_nir_src(instr->src[1]);
+
+ bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
+ break;
+ }
+
+ case nir_intrinsic_first_invocation: {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
+ bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
+ fs_reg(component(tmp, 0)));
+ break;
+ }
+
+ case nir_intrinsic_quad_broadcast: {
+ const fs_reg value = get_nir_src(instr->src[0]);
+ nir_const_value *index = nir_src_as_const_value(instr->src[1]);
+ assert(nir_src_bit_size(instr->src[1]) == 32);
+
+ bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
+ value, brw_imm_ud(index->u32[0]), brw_imm_ud(4));
+ break;
+ }
+
+ case nir_intrinsic_quad_swap_horizontal: {
+ const fs_reg value = get_nir_src(instr->src[0]);
+ const fs_reg tmp = bld.vgrf(value.type);
+ const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
+
+ const fs_reg src_left = horiz_stride(value, 2);
+ const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
+ const fs_reg tmp_left = horiz_stride(tmp, 2);
+ const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
+
+ /* From the Cherryview PRM Vol. 7, "Register Region Restrictions":
+ *
+ * "When source or destination datatype is 64b or operation is
+ * integer DWord multiply, regioning in Align1 must follow
+ * these rules:
+ *
+ * [...]
+ *
+ * 3. Source and Destination offset must be the same, except
+ * the case of scalar source."
+ *
+ * In order to work around this, we have to emit two 32-bit MOVs instead
+ * of a single 64-bit MOV to do the shuffle.
+ */
+ if (type_sz(value.type) > 4 &&
+ (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+ ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 0),
+ subscript(src_right, BRW_REGISTER_TYPE_D, 0));
+ ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 1),
+ subscript(src_right, BRW_REGISTER_TYPE_D, 1));
+ ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 0),
+ subscript(src_left, BRW_REGISTER_TYPE_D, 0));
+ ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 1),
+ subscript(src_left, BRW_REGISTER_TYPE_D, 1));
+ } else {
+ ubld.MOV(tmp_left, src_right);
+ ubld.MOV(tmp_right, src_left);
+ }
+ bld.MOV(retype(dest, value.type), tmp);
+ break;
+ }
+
+ case nir_intrinsic_quad_swap_vertical: {
+ const fs_reg value = get_nir_src(instr->src[0]);
+ if (nir_src_bit_size(instr->src[0]) == 32) {
+ /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
+ const fs_reg tmp = bld.vgrf(value.type);
+ const fs_builder ubld = bld.exec_all();
+ ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
+ brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
+ bld.MOV(retype(dest, value.type), tmp);
+ } else {
+ /* For larger data types, we have to either emit dispatch_width many
+ * MOVs or else fall back to doing indirects.
+ */
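+ /* Within a 2x2 quad the invocations are laid out as 0 1 over 2 3, so
+ * XORing the subgroup invocation with 2 yields the index of the
+ * vertically adjacent invocation (0<->2, 1<->3), which SHUFFLE then
+ * reads back.
+ */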
+ fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+ bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+ brw_imm_w(0x2));
+ bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
+ }
+ break;
+ }
+
+ case nir_intrinsic_quad_swap_diagonal: {
+ const fs_reg value = get_nir_src(instr->src[0]);
+ if (nir_src_bit_size(instr->src[0]) == 32) {
+ /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
+ const fs_reg tmp = bld.vgrf(value.type);
+ const fs_builder ubld = bld.exec_all();
+ ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
+ brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
+ bld.MOV(retype(dest, value.type), tmp);
+ } else {
+ /* For larger data types, we have to either emit dispatch_width many
+ * MOVs or else fall back to doing indirects.
+ */
+ fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+ bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+ brw_imm_w(0x3));
+ bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
+ }
+ break;
+ }
+
+ case nir_intrinsic_reduce: {
+ fs_reg src = get_nir_src(instr->src[0]);
+ nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
+ unsigned cluster_size = nir_intrinsic_cluster_size(instr);
+ if (cluster_size == 0 || cluster_size > dispatch_width)
+ cluster_size = dispatch_width;
+
+ /* Figure out the source type */
+ src.type = brw_type_for_nir_type(devinfo,
+ (nir_alu_type)(nir_op_infos[redop].input_types[0] |
+ nir_src_bit_size(instr->src[0])));
+
+ fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
+ opcode brw_op = brw_op_for_nir_reduction_op(redop);
+ brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+
+ /* Set up a register for all of our scratching around and initialize it
+ * to the reduction operation's identity value.
+ */
+ fs_reg scan = bld.vgrf(src.type);
+ bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
+
+ bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
+
+ dest.type = src.type;
+ if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
+ /* In this case the CLUSTER_BROADCAST instruction isn't needed: the
+ * distance between clusters is at least 2 GRFs, so we don't need its
+ * unusual striding and can just use regular MOVs.
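+ *
+ * For example, assuming a 32-byte GRF (REG_SIZE), a SIMD16 reduce over
+ * 64-bit data with cluster_size 16 uses two groups of 8 channels, each
+ * MOVing component 15 of scan (the end of the cluster) into its half
+ * of dest.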
+ */
+ assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
+ const unsigned groups =
+ (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
+ const unsigned group_size = dispatch_width / groups;
+ for (unsigned i = 0; i < groups; i++) {
+ const unsigned cluster = (i * group_size) / cluster_size;
+ const unsigned comp = cluster * cluster_size + (cluster_size - 1);
+ bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
+ component(scan, comp));
+ }
+ } else {
+ bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
+ brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
+ }
+ break;
+ }
+
+ case nir_intrinsic_inclusive_scan:
+ case nir_intrinsic_exclusive_scan: {
+ fs_reg src = get_nir_src(instr->src[0]);
+ nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
+
+ /* Figure out the source type */
+ src.type = brw_type_for_nir_type(devinfo,
+ (nir_alu_type)(nir_op_infos[redop].input_types[0] |
+ nir_src_bit_size(instr->src[0])));
+
+ fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
+ opcode brw_op = brw_op_for_nir_reduction_op(redop);
+ brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+
+ /* Set up a register for all of our scratching around and initialize it
+ * to the reduction operation's identity value.
+ */
+ fs_reg scan = bld.vgrf(src.type);
+ const fs_builder allbld = bld.exec_all();
+ allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
+
+ if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
+ /* Exclusive scan is a bit harder because we have to do an annoying
+ * shift of the contents before we can begin. To make things worse,
+ * we can't do this with a normal stride; we have to use indirects.
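+ *
+ * Concretely, each channel fetches the value of the channel to its left
+ * (invocation - 1) and channel 0 gets the identity; the inclusive scan
+ * emitted below over that shifted data then yields, e.g. for iadd,
+ * channel i = sum of channels 0..i-1, i.e. the exclusive scan.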
+ */
+ fs_reg shifted = bld.vgrf(src.type);
+ fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+ allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+ brw_imm_w(-1));
+ allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
+ allbld.group(1, 0).MOV(component(shifted, 0), identity);
+ scan = shifted;
+ }
+
+ bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
+
+ bld.MOV(retype(dest, src.type), scan);
+ break;
+ }
+
+ case nir_intrinsic_begin_invocation_interlock: {
+ const fs_builder ubld = bld.group(8, 0);
+ const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+
+ ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 * REG_SIZE;
+
+ break;
+ }
+
+ case nir_intrinsic_end_invocation_interlock: {
+ /* We don't need to do anything here */
+ break;
+ }
+
default:
unreachable("unknown intrinsic");
}
return dst;
}
+/*
+ * This helper takes a source register and un/shuffles it into the destination
+ * register.
+ *
+ * If the source type size is smaller than the destination type size the
+ * operation needed is a component shuffle. The opposite case is an
+ * unshuffle. If the source and destination type sizes are equal, the
+ * operation reduces to a simple per-component MOV.
+ *
+ * For example, if the source is a 16-bit type and the destination is
+ * 32-bit, a 3-component .xyz 16-bit vector in SIMD8 looks like this:
+ *
+ * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
+ * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | |
+ *
+ * This helper will return the following 2 32-bit components with the 16-bit
+ * values shuffled:
+ *
+ * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
+ * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 |
+ *
+ * For an unshuffle the example is the opposite: a 64-bit source and a
+ * 32-bit destination. A 2-component .xy 64-bit vector in SIMD8 looks
+ * like this:
+ *
+ * | x1l x1h | x2l x2h | x3l x3h | x4l x4h |
+ * | x5l x5h | x6l x6h | x7l x7h | x8l x8h |
+ * | y1l y1h | y2l y2h | y3l y3h | y4l y4h |
+ * | y5l y5h | y6l y6h | y7l y7h | y8l y8h |
+ *
+ * The returned result would be the following 4 32-bit components unshuffled:
+ *
+ * | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
+ * | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
+ * | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
+ * | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
+ *
+ * - The source and destination registers must not overlap.
+ * - Component units are measured in terms of the smaller of the source
+ * and destination types, because we un/shuffle the smaller components
+ * from/into the bigger ones.
+ * - The first_component parameter allows skipping leading source
+ * components.
+ */
+void
+shuffle_src_to_dst(const fs_builder &bld,
+ const fs_reg &dst,
+ const fs_reg &src,
+ uint32_t first_component,
+ uint32_t components)
+{
+ if (type_sz(src.type) == type_sz(dst.type)) {
+ assert(!regions_overlap(dst,
+ type_sz(dst.type) * bld.dispatch_width() * components,
+ offset(src, bld, first_component),
+ type_sz(src.type) * bld.dispatch_width() * components));
+ for (unsigned i = 0; i < components; i++) {
+ bld.MOV(retype(offset(dst, bld, i), src.type),
+ offset(src, bld, i + first_component));
+ }
+ } else if (type_sz(src.type) < type_sz(dst.type)) {
+ /* Source is shuffled into destination */
+ unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
+ assert(!regions_overlap(dst,
+ type_sz(dst.type) * bld.dispatch_width() *
+ DIV_ROUND_UP(components, size_ratio),
+ offset(src, bld, first_component),
+ type_sz(src.type) * bld.dispatch_width() * components));
+
+ brw_reg_type shuffle_type =
+ brw_reg_type_from_bit_size(8 * type_sz(src.type),
+ BRW_REGISTER_TYPE_D);
+ for (unsigned i = 0; i < components; i++) {
+ fs_reg shuffle_component_i =
+ subscript(offset(dst, bld, i / size_ratio),
+ shuffle_type, i % size_ratio);
+ bld.MOV(shuffle_component_i,
+ retype(offset(src, bld, i + first_component), shuffle_type));
+ }
+ } else {
+ /* Source is unshuffled into destination */
+ unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
+ assert(!regions_overlap(dst,
+ type_sz(dst.type) * bld.dispatch_width() * components,
+ offset(src, bld, first_component / size_ratio),
+ type_sz(src.type) * bld.dispatch_width() *
+ DIV_ROUND_UP(components + (first_component % size_ratio),
+ size_ratio)));
+
+ brw_reg_type shuffle_type =
+ brw_reg_type_from_bit_size(8 * type_sz(dst.type),
+ BRW_REGISTER_TYPE_D);
+ for (unsigned i = 0; i < components; i++) {
+ fs_reg shuffle_component_i =
+ subscript(offset(src, bld, (first_component + i) / size_ratio),
+ shuffle_type, (first_component + i) % size_ratio);
+ bld.MOV(retype(offset(dst, bld, i), shuffle_type),
+ shuffle_component_i);
+ }
+ }
+}
+
+void
+shuffle_from_32bit_read(const fs_builder &bld,
+ const fs_reg &dst,
+ const fs_reg &src,
+ uint32_t first_component,
+ uint32_t components)
+{
+ assert(type_sz(src.type) == 4);
+
+ /* This function takes components in units of the destination type while
+ * shuffle_src_to_dst takes components in units of the smallest type
+ */
+ if (type_sz(dst.type) > 4) {
+ assert(type_sz(dst.type) == 8);
+ first_component *= 2;
+ components *= 2;
+ }
+
+ shuffle_src_to_dst(bld, dst, src, first_component, components);
+}
+
+fs_reg
+shuffle_for_32bit_write(const fs_builder &bld,
+ const fs_reg &src,
+ uint32_t first_component,
+ uint32_t components)
+{
+ fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
+ DIV_ROUND_UP(components * type_sz(src.type), 4));
+ /* This function takes components in units of the source type while
+ * shuffle_src_to_dst takes components in units of the smallest type
+ */
+ if (type_sz(src.type) > 4) {
+ assert(type_sz(src.type) == 8);
+ first_component *= 2;
+ components *= 2;
+ }
+
+ shuffle_src_to_dst(bld, dst, src, first_component, components);
+
+ return dst;
+}
+
fs_reg
setup_imm_df(const fs_builder &bld, double v)
{