/* Return the byte stride required for the destination of `inst`:
 *  - an accumulator destination keeps its current byte stride;
 *  - a destination narrower than the execution type (unless the
 *    instruction is a byte raw MOV) gets the execution type size;
 *  - otherwise the stride is derived from the byte strides and type sizes
 *    of the destination and of the sources taking part in lowering.
 */
unsigned
required_dst_byte_stride(const fs_inst *inst)
{
- if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
+ if (inst->dst.is_accumulator()) {
+ /* If the destination is an accumulator, insist that we leave the
+ * stride alone. We cannot "fix" accumulator destinations by writing
+ * to a temporary and emitting a MOV into the original destination.
+ * For multiply instructions (our one use of the accumulator), the
+ * MUL writes the full 66 bits of the accumulator whereas the MOV we
+ * would emit only writes 33 bits and leaves the top 33 bits
+ * undefined.
+ *
+ * It's safe to just require the original stride here because the
+ * lowering pass will detect the mismatch in has_invalid_src_region
+ * and fix the sources of the multiply instead of the destination.
+ */
+ return inst->dst.stride * type_sz(inst->dst.type);
+ } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
!is_byte_raw_mov(inst)) {
return get_exec_type_size(inst);
} else {
- unsigned stride = inst->dst.stride * type_sz(inst->dst.type);
+ /* Calculate the maximum byte stride and the minimum/maximum type
+ * size across the destination and all source operands we are required
+ * to lower. The destination seeds all three values; uniform sources
+ * and control sources are skipped.
+ */
+ unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
+ unsigned min_size = type_sz(inst->dst.type);
+ unsigned max_size = type_sz(inst->dst.type);
for (unsigned i = 0; i < inst->sources; i++) {
- if (!is_uniform(inst->src[i]))
- stride = MAX2(stride, inst->src[i].stride *
- type_sz(inst->src[i].type));
+ if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
+ const unsigned size = type_sz(inst->src[i].type);
+ max_stride = MAX2(max_stride, inst->src[i].stride * size);
+ min_size = MIN2(min_size, size);
+ max_size = MAX2(max_size, size);
+ }
}
- return stride;
+ /* All operands involved in lowering need to fit in the calculated
+ * stride. The value returned below never exceeds 4 * min_size, so
+ * the largest operand may be at most four times the size of the
+ * smallest one.
+ */
+ assert(max_size <= 4 * min_size);
+
+ /* Attempt to use the largest byte stride among all present operands,
+ * but never exceed an element stride of 4 for the smallest operand
+ * type (i.e. 4 * min_size bytes) since that would lead to illegal
+ * destination regions during lowering.
+ */
+ return MIN2(max_stride, 4 * min_size);
}
}
required_dst_byte_offset(const fs_inst *inst)
{
for (unsigned i = 0; i < inst->sources; i++) {
- if (!is_uniform(inst->src[i]))
+ if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
if (reg_offset(inst->src[i]) % REG_SIZE !=
reg_offset(inst->dst) % REG_SIZE)
return 0;
has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
unsigned i)
{
/* Report whether the region of source `i` of `inst` is considered invalid.
 * Unordered instructions and control sources are never reported. Otherwise
 * a source is reported either because of the Broadwell half-float MAD case
 * described below, or because the instruction is subject to the
 * destination-aligned region restriction and the source is non-uniform
 * with a byte stride or in-register byte offset that differs from the
 * destination's.
 */
- if (is_unordered(inst)) {
+ if (is_unordered(inst) || inst->is_control_source(i))
return false;
- } else {
- const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
- const unsigned src_byte_stride = inst->src[i].stride *
- type_sz(inst->src[i].type);
- const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
- const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
- return has_dst_aligned_region_restriction(devinfo, inst) &&
- !is_uniform(inst->src[i]) &&
- (src_byte_stride != dst_byte_stride ||
- src_byte_offset != dst_byte_offset);
+ /* Empirical testing shows that Broadwell has a bug affecting half-float
+ * MAD instructions when any of its sources has a non-zero offset, such
+ * as:
+ *
+ * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
+ *
+ * We used to generate code like this for SIMD8 executions, where
+ * components Y and W of a vector were packed at offset 16B of a SIMD
+ * register. The problem doesn't occur if the stride of the source is 0.
+ */
+ if (devinfo->gen == 8 &&
+ inst->opcode == BRW_OPCODE_MAD &&
+ inst->src[i].type == BRW_REGISTER_TYPE_HF &&
+ reg_offset(inst->src[i]) % REG_SIZE > 0 &&
+ inst->src[i].stride != 0) {
+ return true;
}
+
/* Byte strides and byte offsets within a register (modulo REG_SIZE) of
 * the destination and of source `i`, compared in the return below.
 */
+ const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
+ const unsigned src_byte_stride = inst->src[i].stride *
+ type_sz(inst->src[i].type);
+ const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
+ const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
+
+ return has_dst_aligned_region_restriction(devinfo, inst) &&
+ !is_uniform(inst->src[i]) &&
+ (src_byte_stride != dst_byte_stride ||
+ src_byte_offset != dst_byte_offset);
}
/*
const unsigned stride =
type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
- const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), stride);
+ fs_reg tmp = ibld.vgrf(type, stride);
+ ibld.UNDEF(tmp);
+ tmp = horiz_stride(tmp, stride);
/* Emit a MOV taking care of all the destination modifiers. */
fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
type_sz(inst->src[i].type);
assert(stride > 0);
- const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type, stride),
- stride);
+ fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
+ ibld.UNDEF(tmp);
+ tmp = horiz_stride(tmp, stride);
/* Emit a series of 32-bit integer copies with any source modifiers
* cleaned up (because their semantics are dependent on the type).
bool
lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
{
+ /* We cannot replace the result of an integer multiply which writes the
+ * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
+ * value whereas the MOV will act on only 32 or 33 bits of the
+ * accumulator.
+ */
+ assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
+ brw_reg_type_is_floating_point(inst->dst.type));
+
const fs_builder ibld(v, block, inst);
const unsigned stride = required_dst_byte_stride(inst) /
type_sz(inst->dst.type);
assert(stride > 0);
- const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type, stride),
- stride);
+ fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
+ ibld.UNDEF(tmp);
+ tmp = horiz_stride(tmp, stride);
/* Emit a series of 32-bit integer copies from the temporary into the
* original destination.
progress |= lower_instruction(this, block, inst);
if (progress)
- invalidate_live_intervals();
+ invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
return progress;
}