case UNIFORM:
return 4 * type_sz(src[arg].type);
default:
- /* XXX - Represent actual execution size and vertical stride. */
- return 8 * type_sz(src[arg].type);
+ /* XXX - Represent actual vertical stride. */
+ return exec_size * type_sz(src[arg].type);
}
}
{
switch (opcode) {
case SHADER_OPCODE_GEN4_SCRATCH_READ:
+ case VEC4_OPCODE_FROM_DOUBLE:
+ case VEC4_OPCODE_TO_DOUBLE:
+ case VEC4_OPCODE_PICK_LOW_32BIT:
+ case VEC4_OPCODE_PICK_HIGH_32BIT:
+ case VEC4_OPCODE_SET_LOW_32BIT:
+ case VEC4_OPCODE_SET_HIGH_32BIT:
case VS_OPCODE_PULL_CONSTANT_LOAD:
case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
inst->src[0].file == IMM &&
inst->predicate == BRW_PREDICATE_NONE &&
inst->dst.writemask != WRITEMASK_XYZW &&
+ type_sz(inst->src[0].type) < 8 &&
(inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) {
vf = brw_float_to_vf(inst->src[0].d);
case BRW_OPCODE_DP2:
swizzle = brw_swizzle_for_size(2);
break;
+
+ case VEC4_OPCODE_TO_DOUBLE:
+ case VEC4_OPCODE_FROM_DOUBLE:
+ case VEC4_OPCODE_PICK_LOW_32BIT:
+ case VEC4_OPCODE_PICK_HIGH_32BIT:
+ case VEC4_OPCODE_SET_LOW_32BIT:
+ case VEC4_OPCODE_SET_HIGH_32BIT:
+ swizzle = brw_swizzle_for_size(4);
+ break;
+
default:
swizzle = brw_swizzle_for_mask(inst->dst.writemask);
break;
if (inst->src[i].file != UNIFORM)
continue;
+ assert(type_sz(inst->src[i].type) % 4 == 0);
+ unsigned channel_size = type_sz(inst->src[i].type) / 4;
+
int reg = inst->src[i].nr;
for (int c = 0; c < 4; c++) {
if (!(readmask & (1 << c)))
continue;
- chans_used[reg] = MAX2(chans_used[reg],
- BRW_GET_SWZ(inst->src[i].swizzle, c) + 1);
+ unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
+ unsigned used = MAX2(chans_used[reg], channel * channel_size);
+ if (used <= 4)
+ chans_used[reg] = used;
+ else
+ chans_used[reg + 1] = used - 4;
}
}
int dst;
/* Find the lowest place we can slot this uniform in. */
for (dst = 0; dst < src; dst++) {
- if (chans_used[dst] + size <= 4)
- break;
+ if (chans_used[dst] + size <= 4)
+ break;
}
if (src == dst) {
- new_loc[src] = dst;
- new_chan[src] = 0;
+ new_loc[src] = dst;
+ new_chan[src] = 0;
} else {
- new_loc[src] = dst;
- new_chan[src] = chans_used[dst];
+ new_loc[src] = dst;
+ new_chan[src] = chans_used[dst];
- /* Move the references to the data */
- for (int j = 0; j < size; j++) {
- stage_prog_data->param[dst * 4 + new_chan[src] + j] =
- stage_prog_data->param[src * 4 + j];
- }
+ /* Move the references to the data */
+ for (int j = 0; j < size; j++) {
+ stage_prog_data->param[dst * 4 + new_chan[src] + j] =
+ stage_prog_data->param[src * 4 + j];
+ }
- chans_used[dst] += size;
- chans_used[src] = 0;
+ chans_used[dst] += size;
+ chans_used[src] = 0;
}
new_uniform_count = MAX2(new_uniform_count, dst + 1);
for (int i = 0 ; i < 3; i++) {
int src = inst->src[i].nr;
- if (inst->src[i].file != UNIFORM)
- continue;
+ if (inst->src[i].file != UNIFORM)
+ continue;
inst->src[i].nr = new_loc[src];
inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
(reg.type == BRW_REGISTER_TYPE_UD || \
reg.type == BRW_REGISTER_TYPE_D)
- /* "When source or destination datatype is 64b or operation is integer DWord
+ /* From the Cherryview and Broadwell PRMs:
+ *
+ * "When source or destination datatype is 64b or operation is integer DWord
* multiply, DepCtrl must not be used."
- * May apply to future SoCs as well.
+ *
+ * SKL PRMs don't include this restriction though.
*/
- if (devinfo->is_cherryview) {
+ if (devinfo->gen == 8 || devinfo->is_broxton) {
if (inst->opcode == BRW_OPCODE_MUL &&
IS_DWORD(inst->src[0]) &&
IS_DWORD(inst->src[1]))
/* Can't coalesce this GRF if someone else was going to
* read it later.
*/
- if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 4) > ip)
+ if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip)
continue;
/* We need to check interference with the final destination between this
scan_inst->dst.type == scan_inst->src[0].type))
break;
+ /* Only allow coalescing between registers of the same type size.
+ * Otherwise we would need to make the pass aware of the fact that
+ * channel sizes are different for single and double precision.
+ */
+ if (type_sz(inst->src[0].type) != type_sz(scan_inst->src[0].type))
+ break;
+
+ /* Check that scan_inst writes the same amount of data as the
+ * instruction, otherwise coalescing would lead to writing a
+ * different (larger or smaller) region of the destination
+ */
+ if (scan_inst->size_written != inst->size_written)
+ break;
+
/* If we can't handle the swizzle, bail. */
if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
inst->src[0].swizzle,
break;
}
- /* This only handles coalescing of a single register starting at
- * the source offset of the copy instruction.
+ /* This only handles coalescing writes of 8 channels (1 register
+ * for single-precision and 2 registers for double-precision)
+ * starting at the source offset of the copy instruction.
*/
- if (scan_inst->size_written > REG_SIZE ||
+ if (DIV_ROUND_UP(scan_inst->size_written,
+ type_sz(scan_inst->dst.type)) > 8 ||
scan_inst->dst.offset != inst->src[0].offset)
break;
pred_ctrl_align16[inst->predicate]);
}
- fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
+ fprintf(file, "%s(%d)", brw_instruction_name(devinfo, inst->opcode),
+ inst->exec_size);
if (inst->saturate)
fprintf(file, ".sat");
if (inst->conditional_mod) {
case BRW_REGISTER_TYPE_F:
fprintf(file, "%fF", inst->src[i].f);
break;
+ case BRW_REGISTER_TYPE_DF:
+ fprintf(file, "%fDF", inst->src[i].df);
+ break;
case BRW_REGISTER_TYPE_D:
fprintf(file, "%dD", inst->src[i].d);
break;
if (inst->force_writemask_all)
fprintf(file, " NoMask");
+ if (inst->exec_size != 8)
+ fprintf(file, " group%d", inst->group);
+
fprintf(file, "\n");
}
struct src_reg &src = inst->src[i];
struct brw_reg reg;
switch (src.file) {
- case VGRF:
- reg = byte_offset(brw_vec8_grf(src.nr, 0), src.offset);
+ case VGRF: {
+ const unsigned type_size = type_sz(src.type);
+ const unsigned width = REG_SIZE / 2 / MAX2(4, type_size);
+ reg = byte_offset(brw_vecn_grf(width, src.nr, 0), src.offset);
reg.type = src.type;
- reg.swizzle = src.swizzle;
reg.abs = src.abs;
reg.negate = src.negate;
break;
+ }
- case UNIFORM:
+ case UNIFORM: {
+ const unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(src.type));
reg = stride(byte_offset(brw_vec4_grf(
prog_data->base.dispatch_grf_start_reg +
src.nr / 2, src.nr % 2 * 4),
src.offset),
- 0, 4, 1);
+ 0, width, 1);
reg.type = src.type;
- reg.swizzle = src.swizzle;
reg.abs = src.abs;
reg.negate = src.negate;
/* This should have been moved to pull constants. */
assert(!src.reladdr);
break;
+ }
- case ARF:
case FIXED_GRF:
+ if (type_sz(src.type) == 8) {
+ reg = src.as_brw_reg();
+ break;
+ }
+ /* fallthrough */
+ case ARF:
case IMM:
continue;
unreachable("not reached");
}
+ apply_logical_swizzle(®, inst, i);
src = reg;
}
if (inst->is_3src(devinfo)) {
/* 3-src instructions with scalar sources support arbitrary subnr,
* but don't actually use swizzles. Convert swizzle into subnr.
+ * Skip this for double-precision instructions: RepCtrl=1 is not
+ * allowed for them and needs special handling.
*/
for (int i = 0; i < 3; i++) {
- if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0) {
+ if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0 &&
+ type_sz(inst->src[i].type) < 8) {
assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
}
}
}
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst. The instruction will be left untouched by
+ * vec4_visitor::lower_simd_width() if the returned value matches the
+ * instruction's original execution size.
+ */
+static unsigned
+get_lowered_simd_width(const struct gen_device_info *devinfo,
+ const vec4_instruction *inst)
+{
+ unsigned lowered_width = MIN2(16, inst->exec_size);
+
+ /* We need to split some cases of double-precision instructions that write
+ * 2 registers. We only need to care about this in gen7 because that is the
+ * only hardware that implements fp64 in Align16.
+ */
+ if (devinfo->gen == 7 && inst->size_written > REG_SIZE) {
+ /* Align16 8-wide double-precision SEL does not work well. Verified
+ * empirically.
+ */
+ if (inst->opcode == BRW_OPCODE_SEL && type_sz(inst->dst.type) == 8)
+ lowered_width = MIN2(lowered_width, 4);
+
+ /* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct
+ * Register Addressing:
+ *
+ * "When destination spans two registers, the source MUST span two
+ * registers."
+ */
+ for (unsigned i = 0; i < 3; i++) {
+ if (inst->src[i].file == BAD_FILE)
+ continue;
+ if (inst->size_read(i) <= REG_SIZE)
+ lowered_width = MIN2(lowered_width, 4);
+ }
+ }
+
+ return lowered_width;
+}
+
+static bool
+dst_src_regions_overlap(vec4_instruction *inst)
+{
+ if (inst->size_written == 0)
+ return false;
+
+ unsigned dst_start = inst->dst.offset;
+ unsigned dst_end = dst_start + inst->size_written - 1;
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file == BAD_FILE)
+ continue;
+
+ if (inst->dst.file != inst->src[i].file ||
+ inst->dst.nr != inst->src[i].nr)
+ continue;
+
+ unsigned src_start = inst->src[i].offset;
+ unsigned src_end = src_start + inst->size_read(i) - 1;
+
+ if ((dst_start >= src_start && dst_start <= src_end) ||
+ (dst_end >= src_start && dst_end <= src_end) ||
+ (dst_start <= src_start && dst_end >= src_end)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool
+vec4_visitor::lower_simd_width()
+{
+ bool progress = false;
+
+ foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+ const unsigned lowered_width = get_lowered_simd_width(devinfo, inst);
+ assert(lowered_width <= inst->exec_size);
+ if (lowered_width == inst->exec_size)
+ continue;
+
+ /* We need to deal with source / destination overlaps when splitting.
+ * The hardware supports reading from and writing to the same register
+ * in the same instruction, but we need to be careful that each split
+ * instruction we produce does not corrupt the source of the next.
+ *
+ * The easiest way to handle this is to make the split instructions write
+ * to temporaries if there is an src/dst overlap and then move from the
+ * temporaries to the original destination. We also need to consider
+ * instructions that do partial writes via align1 opcodes, in which case
+ * we need to make sure that the we initialize the temporary with the
+ * value of the instruction's dst.
+ */
+ bool needs_temp = dst_src_regions_overlap(inst);
+ for (unsigned n = 0; n < inst->exec_size / lowered_width; n++) {
+ unsigned channel_offset = lowered_width * n;
+
+ unsigned size_written = lowered_width * type_sz(inst->dst.type);
+
+ /* Create the split instruction from the original so that we copy all
+ * relevant instruction fields, then set the width and calculate the
+ * new dst/src regions.
+ */
+ vec4_instruction *linst = new(mem_ctx) vec4_instruction(*inst);
+ linst->exec_size = lowered_width;
+ linst->group = channel_offset;
+ linst->size_written = size_written;
+
+ /* Compute split dst region */
+ dst_reg dst;
+ if (needs_temp) {
+ unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
+ dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
+ inst->dst.type);
+ if (inst->is_align1_partial_write()) {
+ vec4_instruction *copy = MOV(dst, src_reg(inst->dst));
+ copy->exec_size = lowered_width;
+ copy->group = channel_offset;
+ copy->size_written = size_written;
+ inst->insert_before(block, copy);
+ }
+ } else {
+ dst = horiz_offset(inst->dst, channel_offset);
+ }
+ linst->dst = dst;
+
+ /* Compute split source regions */
+ for (int i = 0; i < 3; i++) {
+ if (linst->src[i].file == BAD_FILE)
+ continue;
+
+ if (!is_uniform(linst->src[i]))
+ linst->src[i] = horiz_offset(linst->src[i], channel_offset);
+ }
+
+ inst->insert_before(block, linst);
+
+ /* If we used a temporary to store the result of the split
+ * instruction, copy the result to the original destination
+ */
+ if (needs_temp) {
+ vec4_instruction *mov =
+ MOV(offset(inst->dst, lowered_width, n), src_reg(dst));
+ mov->exec_size = lowered_width;
+ mov->group = channel_offset;
+ mov->size_written = size_written;
+ mov->predicate = inst->predicate;
+ inst->insert_before(block, mov);
+ }
+ }
+
+ inst->remove(block);
+ progress = true;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+static bool
+is_align1_df(vec4_instruction *inst)
+{
+ switch (inst->opcode) {
+ case VEC4_OPCODE_FROM_DOUBLE:
+ case VEC4_OPCODE_TO_DOUBLE:
+ case VEC4_OPCODE_PICK_LOW_32BIT:
+ case VEC4_OPCODE_PICK_HIGH_32BIT:
+ case VEC4_OPCODE_SET_LOW_32BIT:
+ case VEC4_OPCODE_SET_HIGH_32BIT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static brw_predicate
+scalarize_predicate(brw_predicate predicate, unsigned writemask)
+{
+ if (predicate != BRW_PREDICATE_NORMAL)
+ return predicate;
+
+ switch (writemask) {
+ case WRITEMASK_X:
+ return BRW_PREDICATE_ALIGN16_REPLICATE_X;
+ case WRITEMASK_Y:
+ return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+ case WRITEMASK_Z:
+ return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+ case WRITEMASK_W:
+ return BRW_PREDICATE_ALIGN16_REPLICATE_W;
+ default:
+ unreachable("invalid writemask");
+ }
+}
+
+bool
+vec4_visitor::scalarize_df()
+{
+ bool progress = false;
+
+ foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+ /* Skip DF instructions that operate in Align1 mode */
+ if (is_align1_df(inst))
+ continue;
+
+ /* Check if this is a double-precision instruction */
+ bool is_double = type_sz(inst->dst.type) == 8;
+ for (int arg = 0; !is_double && arg < 3; arg++) {
+ is_double = inst->src[arg].file != BAD_FILE &&
+ type_sz(inst->src[arg].type) == 8;
+ }
+
+ if (!is_double)
+ continue;
+
+ /* Generate scalar instructions for each enabled channel */
+ for (unsigned chan = 0; chan < 4; chan++) {
+ unsigned chan_mask = 1 << chan;
+ if (!(inst->dst.writemask & chan_mask))
+ continue;
+
+ vec4_instruction *scalar_inst = new(mem_ctx) vec4_instruction(*inst);
+
+ for (unsigned i = 0; i < 3; i++) {
+ unsigned swz = BRW_GET_SWZ(inst->src[i].swizzle, chan);
+ scalar_inst->src[i].swizzle = BRW_SWIZZLE4(swz, swz, swz, swz);
+ }
+
+ scalar_inst->dst.writemask = chan_mask;
+
+ if (inst->predicate != BRW_PREDICATE_NONE) {
+ scalar_inst->predicate =
+ scalarize_predicate(inst->predicate, chan_mask);
+ }
+
+ inst->insert_before(block, scalar_inst);
+ }
+
+ inst->remove(block);
+ progress = true;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+bool
+vec4_visitor::lower_64bit_mad_to_mul_add()
+{
+ bool progress = false;
+
+ foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+ if (inst->opcode != BRW_OPCODE_MAD)
+ continue;
+
+ if (type_sz(inst->dst.type) != 8)
+ continue;
+
+ dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type);
+
+ /* Use the copy constructor so we copy all relevant instruction fields
+ * from the original mad into the add and mul instructions
+ */
+ vec4_instruction *mul = new(mem_ctx) vec4_instruction(*inst);
+ mul->opcode = BRW_OPCODE_MUL;
+ mul->dst = mul_dst;
+ mul->src[0] = inst->src[1];
+ mul->src[1] = inst->src[2];
+ mul->src[2].file = BAD_FILE;
+
+ vec4_instruction *add = new(mem_ctx) vec4_instruction(*inst);
+ add->opcode = BRW_OPCODE_ADD;
+ add->src[0] = src_reg(mul_dst);
+ add->src[1] = inst->src[0];
+ add->src[2].file = BAD_FILE;
+
+ inst->insert_before(block, mul);
+ inst->insert_before(block, add);
+ inst->remove(block);
+
+ progress = true;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+/* The align16 hardware can only do 32-bit swizzle channels, so we need to
+ * translate the logical 64-bit swizzle channels that we use in the Vec4 IR
+ * to 32-bit swizzle channels in hardware registers.
+ *
+ * @inst and @arg identify the original vec4 IR source operand we need to
+ * translate the swizzle for and @hw_reg is the hardware register where we
+ * will write the hardware swizzle to use.
+ *
+ * This pass assumes that Align16/DF instructions have been fully scalarized
+ * previously so there is just one 64-bit swizzle channel to deal with for any
+ * given Vec4 IR source.
+ */
+void
+vec4_visitor::apply_logical_swizzle(struct brw_reg *hw_reg,
+ vec4_instruction *inst, int arg)
+{
+ src_reg reg = inst->src[arg];
+
+ if (reg.file == BAD_FILE || reg.file == BRW_IMMEDIATE_VALUE)
+ return;
+
+ /* If this is not a 64-bit operand or this is a scalar instruction we don't
+ * need to do anything about the swizzles.
+ */
+ if(type_sz(reg.type) < 8 || is_align1_df(inst)) {
+ hw_reg->swizzle = reg.swizzle;
+ return;
+ }
+
+ /* Otherwise we should have scalarized the instruction, so take the single
+ * 64-bit logical swizzle channel and translate it to 32-bit
+ */
+ assert(brw_is_single_value_swizzle(reg.swizzle));
+
+ /* To gain access to Z/W components we need to select the second half
+ * of the register and then use a X/Y swizzle to select Z/W respectively.
+ */
+ unsigned swizzle = BRW_GET_SWZ(reg.swizzle, 0);
+
+ if (swizzle >= 2) {
+ *hw_reg = suboffset(*hw_reg, 2);
+ swizzle -= 2;
+ }
+
+ /* Any 64-bit source with an offset at 16B is intended to address the
+ * second half of a register and needs a vertical stride of 0 so we:
+ *
+ * 1. Don't violate register region restrictions.
+ * 2. Activate the gen7 instruction decompresion bug exploit when
+ * execsize > 4
+ */
+ if (hw_reg->subnr % REG_SIZE == 16) {
+ assert(devinfo->gen == 7);
+ hw_reg->vstride = BRW_VERTICAL_STRIDE_0;
+ }
+
+ hw_reg->swizzle = BRW_SWIZZLE4(swizzle * 2, swizzle * 2 + 1,
+ swizzle * 2, swizzle * 2 + 1);
+}
+
bool
vec4_visitor::run()
{
OPT(dead_code_eliminate);
}
+ if (OPT(lower_simd_width)) {
+ OPT(opt_copy_propagation);
+ OPT(dead_code_eliminate);
+ }
+
if (failed)
return false;
+ OPT(lower_64bit_mad_to_mul_add);
+ OPT(scalarize_df);
+
setup_payload();
if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {