struct brw_reg gdst = suboffset(dst, group);
struct brw_reg dst_d = retype(spread(gdst, 2),
BRW_REGISTER_TYPE_D);
+ assert(dst.hstride == 1);
brw_MOV(p, dst_d,
retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
brw_MOV(p, byte_offset(dst_d, 4),
retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
} else {
- brw_MOV(p, suboffset(dst, group),
+ brw_MOV(p, suboffset(dst, group * dst.hstride),
retype(brw_VxH_indirect(0, 0), src.type));
}
}
break;
case SHADER_OPCODE_CLUSTER_BROADCAST: {
- assert(src[0].type == dst.type);
assert(!src[0].negate && !src[0].abs);
assert(src[1].file == BRW_IMMEDIATE_VALUE);
assert(src[1].type == BRW_REGISTER_TYPE_UD);
assert(src[2].type == BRW_REGISTER_TYPE_UD);
const unsigned component = src[1].ud;
const unsigned cluster_size = src[2].ud;
+ unsigned vstride = cluster_size;
+ unsigned width = cluster_size;
+
+ /* The maximum exec_size is 32, but the maximum width is only 16. */
+ if (inst->exec_size == width) {
+ vstride = 0;
+ width = 1;
+ }
+
struct brw_reg strided = stride(suboffset(src[0], component),
- cluster_size, cluster_size, 0);
+ vstride, width, 0);
if (type_sz(src[0].type) > 4 &&
(devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
/* IVB has an issue (which we found empirically) where it reads
* indirect here to handle adding 4 bytes to the offset and avoid
* the extra ADD to the register file.
*/
+ assert(src[0].type == dst.type);
brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
subscript(strided, BRW_REGISTER_TYPE_D, 0));
brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),