#define BRW_SWIZZLE_ZZZZ BRW_SWIZZLE4(2,2,2,2)
#define BRW_SWIZZLE_WWWW BRW_SWIZZLE4(3,3,3,3)
#define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1)
+#define BRW_SWIZZLE_YXYX BRW_SWIZZLE4(1,0,1,0)
#define BRW_SWIZZLE_XZXZ BRW_SWIZZLE4(0,2,0,2)
#define BRW_SWIZZLE_YZXW BRW_SWIZZLE4(1,2,0,3)
#define BRW_SWIZZLE_YWYW BRW_SWIZZLE4(1,3,1,3)
#define BRW_SWIZZLE_ZXYW BRW_SWIZZLE4(2,0,1,3)
#define BRW_SWIZZLE_ZWZW BRW_SWIZZLE4(2,3,2,3)
+#define BRW_SWIZZLE_WZWZ BRW_SWIZZLE4(3,2,3,2)
#define BRW_SWIZZLE_WZYX BRW_SWIZZLE4(3,2,1,0)
#define BRW_SWIZZLE_XXZZ BRW_SWIZZLE4(0,0,2,2)
#define BRW_SWIZZLE_YYWW BRW_SWIZZLE4(1,1,3,3)
}
}
+/* Gen7 has a hardware decompression bug that we can exploit to represent
+ * handful of additional swizzles natively.
+ */
+static bool
+is_gen7_supported_64bit_swizzle(vec4_instruction *inst, unsigned arg)
+{
+ switch (inst->src[arg].swizzle) {
+ case BRW_SWIZZLE_XXXX:
+ case BRW_SWIZZLE_YYYY:
+ case BRW_SWIZZLE_ZZZZ:
+ case BRW_SWIZZLE_WWWW:
+ case BRW_SWIZZLE_XYXY:
+ case BRW_SWIZZLE_YXYX:
+ case BRW_SWIZZLE_ZWZW:
+ case BRW_SWIZZLE_WZWZ:
+ return true;
+ default:
+ return false;
+ }
+}
+
/* 64-bit sources use regions with a width of 2. These 2 elements in each row
* can be addressed using 32-bit swizzles (which is what the hardware supports)
* but it also means that the swizzle we apply on the first two components of a
* dvec4 is coupled with the swizzle we use for the last 2. In other words,
* only some specific swizzle combinations can be natively supported.
*
- * FIXME: We can also exploit the vstride 0 decompression bug in gen7 to
- * implement some more swizzles via simple translations. For
- * example: XXXX as XYXY, YYYY as ZWZW (same for ZZZZ and WWWW by
- * using subnr), XYXY as XYZW, YXYX as ZWXY (same for ZWZW and
- * WZWZ using subnr).
- *
* FIXME: we can go an step further and implement even more swizzle
* variations using only partial scalarization.
*
* https://bugs.freedesktop.org/show_bug.cgi?id=92760#c82
*/
bool
-vec4_visitor::is_supported_64bit_region(src_reg src)
+vec4_visitor::is_supported_64bit_region(vec4_instruction *inst, unsigned arg)
{
+ const src_reg &src = inst->src[arg];
assert(type_sz(src.type) == 8);
/* Uniform regions have a vstride=0. Because we use 2-wide rows with
case BRW_SWIZZLE_YXWZ:
return true;
default:
- return false;
+ return devinfo->gen == 7 && is_gen7_supported_64bit_swizzle(inst, arg);
}
}
for (unsigned i = 0; i < 3; i++) {
if (inst->src[i].file == BAD_FILE || type_sz(inst->src[i].type) < 8)
continue;
- skip_lowering = skip_lowering &&
- is_supported_64bit_region(inst->src[i]);
+ skip_lowering = skip_lowering && is_supported_64bit_region(inst, i);
}
}
/* Take the 64-bit logical swizzle channel and translate it to 32-bit */
assert(brw_is_single_value_swizzle(reg.swizzle) ||
- is_supported_64bit_region(reg));
+ is_supported_64bit_region(inst, arg));
- if (is_supported_64bit_region(reg)) {
+ if (is_supported_64bit_region(inst, arg) &&
+ !is_gen7_supported_64bit_swizzle(inst, arg)) {
/* Supported 64-bit swizzles are those such that their first two
* components, when expanded to 32-bit swizzles, match the semantics
* of the original 64-bit swizzle with 2-wide row regioning.
hw_reg->swizzle = BRW_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1,
swizzle1 * 2, swizzle1 * 2 + 1);
} else {
- /* If we got here then we have an unsupported swizzle and the
- * instruction should have been scalarized.
+ /* If we got here then we have one of the following:
+ *
+ * 1. An unsupported swizzle, which should be single-value thanks to the
+ * scalarization pass.
+ *
+ * 2. A gen7 supported swizzle. These can be single-value or double-value
+ * swizzles. If the latter, they are never cross-dvec2 channels. For
+ * these we always need to activate the gen7 vstride=0 exploit.
*/
- assert(brw_is_single_value_swizzle(reg.swizzle));
- unsigned swizzle = BRW_GET_SWZ(reg.swizzle, 0);
+ unsigned swizzle0 = BRW_GET_SWZ(reg.swizzle, 0);
+ unsigned swizzle1 = BRW_GET_SWZ(reg.swizzle, 1);
+ assert((swizzle0 < 2) == (swizzle1 < 2));
/* To gain access to Z/W components we need to select the second half
* of the register and then use a X/Y swizzle to select Z/W respectively.
*/
- if (swizzle >= 2) {
+ if (swizzle0 >= 2) {
*hw_reg = suboffset(*hw_reg, 2);
- swizzle -= 2;
+ swizzle0 -= 2;
+ swizzle1 -= 2;
}
+ /* All gen7-specific supported swizzles require the vstride=0 exploit */
+ if (devinfo->gen == 7 && is_gen7_supported_64bit_swizzle(inst, arg))
+ hw_reg->vstride = BRW_VERTICAL_STRIDE_0;
+
/* Any 64-bit source with an offset at 16B is intended to address the
* second half of a register and needs a vertical stride of 0 so we:
*
hw_reg->vstride = BRW_VERTICAL_STRIDE_0;
}
- hw_reg->swizzle = BRW_SWIZZLE4(swizzle * 2, swizzle * 2 + 1,
- swizzle * 2, swizzle * 2 + 1);
+ hw_reg->swizzle = BRW_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1,
+ swizzle1 * 2, swizzle1 * 2 + 1);
}
}