{
memset(this, 0, sizeof(*this));
this->smear = -1;
+ stride = 1;
}
/** Generic unset register constructor. */
memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
sizeof(fixed_hw_reg)) == 0 &&
smear == r.smear &&
+ stride == r.stride &&
imm.u == r.imm.u);
}
return result;
}
+fs_reg &
+fs_reg::apply_stride(unsigned stride)
+{
+ assert((this->stride * stride) <= 4 &&
+ (is_power_of_two(stride) || stride == 0) &&
+ file != HW_REG && file != IMM);
+ this->stride *= stride;
+ return *this;
+}
+
+bool
+fs_reg::is_contiguous() const
+{
+ return stride == 1;
+}
+
bool
fs_reg::is_zero() const
{
{
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
this->force_uncompressed ||
- this->force_sechalf);
+ this->force_sechalf || !this->dst.is_contiguous());
}
int
inst->src[0].negate ||
inst->src[0].abs ||
inst->src[0].smear != -1 ||
+ !inst->src[0].is_contiguous() ||
inst->dst.file != GRF ||
inst->dst.type != inst->src[0].type) {
continue;
inst->dst.file != MRF || inst->src[0].file != GRF ||
inst->dst.type != inst->src[0].type ||
inst->src[0].abs || inst->src[0].negate ||
- inst->src[0].smear != -1 || inst->src[0].subreg_offset)
+ inst->src[0].smear != -1 || !inst->src[0].is_contiguous() ||
+ inst->src[0].subreg_offset)
continue;
/* Work out which hardware MRF registers are written by this
bool is_one() const;
bool is_null() const;
bool is_valid_3src() const;
+ bool is_contiguous() const;
fs_reg retype(uint32_t type);
+ fs_reg &apply_stride(unsigned stride);
/** Register file: GRF, MRF, IMM. */
enum register_file file;
*/
int subreg_offset;
+ /** Register region horizontal stride */
+ int stride;
+
fs_reg *reladdr;
};
if (entry->src.file == IMM)
return false;
- if (inst->regs_read(this, arg) > 1)
+ /* Bail if inst is reading more than entry is writing. */
+ if ((inst->regs_read(this, arg) * inst->src[arg].stride *
+ type_sz(inst->src[arg].type)) > type_sz(entry->dst.type))
return false;
if (inst->src[arg].file != entry->dst.file ||
bool has_source_modifiers = entry->src.abs || entry->src.negate;
if ((has_source_modifiers || entry->src.file == UNIFORM ||
- entry->src.smear != -1) && !can_do_source_mods(inst))
+ entry->src.smear != -1 || !entry->src.is_contiguous()) &&
+ !can_do_source_mods(inst))
+ return false;
+
+ /* Bail if the result of composing both strides would exceed the
+ * hardware limit.
+ */
+ if (entry->src.stride * inst->src[arg].stride > 4)
+ return false;
+
+ /* Bail if the result of composing both strides cannot be expressed
+ * as another stride. This avoids, for example, trying to transform
+ * this:
+ *
+ * MOV (8) rX<1>UD rY<0;1,0>UD
+ * FOO (8) ... rX<8;8,1>UW
+ *
+ * into this:
+ *
+ * FOO (8) ... rY<0;1,0>UW
+ *
+ * Which would have different semantics.
+ */
+ if (entry->src.stride != 1 &&
+ (inst->src[arg].stride *
+ type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
return false;
if (has_source_modifiers && entry->dst.type != inst->src[arg].type)
if (entry->src.smear != -1)
inst->src[arg].smear = entry->src.smear;
inst->src[arg].subreg_offset = entry->src.subreg_offset;
+ inst->src[arg].stride *= entry->src.stride;
if (!inst->src[arg].abs) {
inst->src[arg].abs = entry->src.abs;
if (inst->src[i].file != entry->dst.file ||
inst->src[i].reg != entry->dst.reg ||
inst->src[i].reg_offset != entry->dst.reg_offset ||
- inst->src[i].subreg_offset != entry->dst.subreg_offset)
+ inst->src[i].subreg_offset != entry->dst.subreg_offset ||
+ inst->src[i].type != entry->dst.type ||
+ inst->src[i].stride > 1)
continue;
/* Don't bother with cases that should have been taken care of by the
switch (reg->file) {
case GRF:
case MRF:
- if (reg->smear == -1) {
- brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
+ if (reg->stride == 0 || reg->smear >= 0) {
+ brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);
} else {
- brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);
+ brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
+ brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);
}
+
brw_reg = retype(brw_reg, reg->type);
if (reg->sechalf)
brw_reg = sechalf(brw_reg);
* would get stomped by the first decode as well.
*/
int end_ip = ip;
- if (v->dispatch_width == 16 && (reg.smear != -1 ||
+ if (v->dispatch_width == 16 && (reg.smear != -1 || reg.stride == 0 ||
(v->pixel_x.reg == reg.reg ||
v->pixel_y.reg == reg.reg))) {
end_ip++;
* loading pull constants, so spilling them is unlikely to reduce
* register pressure anyhow.
*/
- if (inst->src[i].smear >= 0) {
+ if (inst->src[i].smear >= 0 || !inst->src[i].is_contiguous()) {
no_spill[inst->src[i].reg] = true;
}
}
if (inst->dst.file == GRF) {
spill_costs[inst->dst.reg] += inst->regs_written * loop_scale;
- if (inst->dst.smear >= 0) {
+ if (inst->dst.smear >= 0 || !inst->dst.is_contiguous()) {
no_spill[inst->dst.reg] = true;
}
}