if (entry->src.file == IMM)
return false;
assert(entry->src.file == VGRF || entry->src.file == UNIFORM ||
- entry->src.file == ATTR);
+ entry->src.file == ATTR || entry->src.file == FIXED_GRF);
if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
    inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD)
   return false;

/* Bail if inst reads a range not contained in the range entry writes. */
if (!region_contained_in(inst->src[arg], inst->size_read(arg),
                         entry->dst, entry->size_written))
   return false;
+ /* Avoid propagating a FIXED_GRF register into an EOT instruction, so that
+  * any register allocation restrictions on its sources can still be
+  * applied.
+  */
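+ /* (Sources of a send with EOT must be allocated to the last GRFs of the
+  * register file, a constraint the allocator can only enforce on virtual
+  * registers.)
+  */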
+ if (entry->src.file == FIXED_GRF && inst->eot)
+ return false;
+
+ /* Avoid propagating odd-numbered FIXED_GRF registers into the first source
+ * of a LINTERP instruction on platforms where the PLN instruction has
+ * register alignment restrictions.
+ */
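+ /* (On those platforms the generator can only emit PLN when the barycentric
+  * delta payload starts at an even GRF; see fs_generator::generate_linterp().)
+  */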
+ if (devinfo->has_pln && devinfo->gen <= 6 &&
+ entry->src.file == FIXED_GRF && (entry->src.nr & 1) &&
+ inst->opcode == FS_OPCODE_LINTERP && arg == 0)
+ return false;
+
/* we can't generally copy-propagate UD negations because we
 * can end up accessing the resulting values as signed integers
 * instead. See also resolve_ud_negate() and comment in
 * fs_generator::generate_code.
 */
if (entry->src.type == BRW_REGISTER_TYPE_UD && entry->src.negate)
   return false;

/* Some instructions implemented in the generator backend, such as
 * derivatives, assume that their operands are packed so we can't
 * generally propagate strided regions to them.
 */
- if (instruction_requires_packed_data(inst) && entry->src.stride > 1)
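+ /* FIXED_GRF copies only enter the ACP when their source region is
+  * contiguous, so they can be treated as having unit stride here.
+  */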
+ const unsigned entry_stride = (entry->src.file == FIXED_GRF ? 1 :
+ entry->src.stride);
+ if (instruction_requires_packed_data(inst) && entry_stride > 1)
return false;
/* Bail if the result of composing both strides would exceed the
* hardware limit.
*/
- if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride,
+ if (!can_take_stride(inst, arg, entry_stride * inst->src[arg].stride,
devinfo))
return false;
+ /* Bail if the source FIXED_GRF region of the copy cannot be trivially
+ * composed with the source region of the instruction -- E.g. because the
+ * copy uses some extended stride greater than 4 not supported natively by
+ * the hardware as a horizontal stride, or because instruction compression
+ * could require us to use a vertical stride shorter than a GRF.
+ */
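+ /* (Hardware horizontal strides are limited to 0, 1, 2 and 4 elements, so
+  * e.g. a consumer reading the copy with a stride of 8 has no valid
+  * encoding.)
+  */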
+ if (entry->src.file == FIXED_GRF &&
+ (inst->src[arg].stride > 4 ||
+ inst->dst.component_size(inst->exec_size) >
+ inst->src[arg].component_size(inst->exec_size)))
+ return false;
+
/* Bail if the instruction type is larger than the execution type of the
 * copy, which implies that each channel is reading multiple channels of
 * the destination of the copy, and simply replacing the sources would
 * give a program with different semantics.
 */
if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type))
   return false;

/* Bail if the result of composing both strides cannot be expressed as
 * another stride, which would have different semantics.
 */
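/* For example, a <0;1,0>:UD scalar copy read back as packed :UW data
 * cannot be folded: the instruction's channels would alternate between
 * the low and high halves of the copied dword, whereas the folded region
 * would broadcast only its low half.
 */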
- if (entry->src.stride != 1 &&
+ if (entry_stride != 1 &&
(inst->src[arg].stride *
type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
return false;
}
}
+ /* Save the offset of inst->src[arg] relative to entry->dst for it to be
+ * applied later.
+ */
+ const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;
+
+ /* Fold the copy into the instruction consuming it. */
inst->src[arg].file = entry->src.file;
inst->src[arg].nr = entry->src.nr;
- inst->src[arg].stride *= entry->src.stride;
- inst->saturate = inst->saturate || entry->saturate;
+ inst->src[arg].subnr = entry->src.subnr;
+ inst->src[arg].offset = entry->src.offset;
+
+ /* Compose the strides of both regions. */
+ if (entry->src.file == FIXED_GRF) {
+ if (inst->src[arg].stride) {
+ const unsigned orig_width = 1 << entry->src.width;
+ const unsigned reg_width = REG_SIZE / (type_sz(inst->src[arg].type) *
+ inst->src[arg].stride);
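+ /* reg_width is the number of components of this type and stride that fit
+  * in a single GRF; the composed width can exceed neither that nor the
+  * width of the original copy region.
+  */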
+ inst->src[arg].width = cvt(MIN2(orig_width, reg_width)) - 1;
+ inst->src[arg].hstride = cvt(inst->src[arg].stride);
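+ /* width, hstride and vstride are stored in their log2-based hardware
+  * encodings, so adding the encoded hstride and width gives a vertical
+  * stride of width * hstride elements, making the region equivalent to a
+  * single flat region of stride hstride.
+  */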
+ inst->src[arg].vstride = inst->src[arg].hstride + inst->src[arg].width;
+ } else {
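+ /* A zero stride denotes a scalar <0;1,0> region. */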
+ inst->src[arg].vstride = inst->src[arg].hstride =
+ inst->src[arg].width = 0;
+ }
- /* Compute the offset of inst->src[arg] relative to entry->dst */
- const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;
+ inst->src[arg].stride = 1;
+
+ /* Hopefully no Align16 around here... */
+ assert(entry->src.swizzle == BRW_SWIZZLE_XYZW);
+ inst->src[arg].swizzle = entry->src.swizzle;
+ } else {
+ inst->src[arg].stride *= entry->src.stride;
+ }
+
+ /* Compose any saturate modifiers. */
+ inst->saturate = inst->saturate || entry->saturate;
/* Compute the first component of the copy that the instruction is
 * reading, and the base byte offset within that component.
 */
const unsigned component = rel_offset / type_sz(entry->dst.type);
const unsigned suboffset = rel_offset % type_sz(entry->dst.type);

/* Calculate the byte offset at the origin of the copy of the given
 * component and suboffset.
 */
- inst->src[arg].offset = suboffset +
- component * entry->src.stride * type_sz(entry->src.type) +
- entry->src.offset;
+ inst->src[arg] = byte_offset(inst->src[arg],
+ component * entry_stride * type_sz(entry->src.type) + suboffset);
if (has_source_modifiers) {
if (entry->dst.type != inst->src[arg].type) {
inst->src[0], inst->size_read(0))) ||
inst->src[0].file == ATTR ||
inst->src[0].file == UNIFORM ||
- inst->src[0].file == IMM) &&
+ inst->src[0].file == IMM ||
+ (inst->src[0].file == FIXED_GRF &&
+ inst->src[0].is_contiguous())) &&
inst->src[0].type == inst->dst.type &&
!inst->is_partial_write());
}
}
/* kill the destination from the ACP */
- if (inst->dst.file == VGRF) {
+ if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
if (regions_overlap(entry->dst, entry->size_written,
                    inst->dst, inst->size_written))
   entry->remove();
}
assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0);
const unsigned size_written = effective_width *
type_sz(inst->src[i].type);
- if (inst->src[i].file == VGRF) {
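+ /* FIXED_GRF sources are only tracked when their region is contiguous, so
+  * the copy can later be folded as if it had unit stride (cf. entry_stride
+  * in try_copy_propagate()).
+  */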
+ if (inst->src[i].file == VGRF ||
+ (inst->src[i].file == FIXED_GRF &&
+ inst->src[i].is_contiguous())) {
acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry);
entry->dst = byte_offset(inst->dst, offset);
entry->src = inst->src[i];