- assert(inst->exec_size / 2 * reg->stride * type_sz(reg->type) <= 32);
- brw_reg = brw_vecn_reg(inst->exec_size / 2, brw_file_from_reg(reg),
- reg->nr, 0);
- brw_reg = stride(brw_reg, inst->exec_size / 2 * reg->stride,
- inst->exec_size / 2, reg->stride);
+ const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
+
+ /* Because the hardware can only split source regions at a whole
+ * multiple of width during decompression (i.e. vertically), clamp
+ * reg_width (computed above) to the physical execution size of a
+ * single decompressed chunk of the instruction:
+ */
+ const unsigned phys_width = compressed ? inst->exec_size / 2 :
+ inst->exec_size;
+
+ /* XXX - The phys_width equation above is, strictly speaking, not
+ * correct on hardware that supports unbalanced GRF writes: on Gen9+
+ * each decompressed chunk of the instruction may have a
+ * different execution size when the number of components
+ * written to each destination GRF is not the same.
+ */
+ const unsigned width = MIN2(reg_width, phys_width);
+ brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
+ brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);