+ case VEC4_OPCODE_PACK_BYTES: {
+ /* Emit the two byte-typed MOVs that implement VEC4_OPCODE_PACK_BYTES:
+ * each MOV reads one byte out of every dword of one half of the source
+ * and writes the four bytes contiguously into the destination.
+ *
+ * Is effectively:
+ *
+ * mov(8) dst<16,4,1>:UB src<4,1,0>:UB
+ *
+ * but destinations' only regioning is horizontal stride, so instead we
+ * have to use two instructions:
+ *
+ * mov(4) dst<1>:UB src<4,1,0>:UB
+ * mov(4) dst.16<1>:UB src.16<4,1,0>:UB
+ *
+ * where they pack the four bytes from the low and high four DW.
+ *
+ * The destination writemask must be non-zero and pass is_power_of_two()
+ * (i.e. presumably exactly one channel is enabled); the index of that
+ * bit, taken with __builtin_ctz(), selects where in each half of the
+ * destination the packed bytes land.
+ *
+ * The dependency-control flags of the original instruction are split
+ * across the pair: the first MOV takes inst->no_dd_check and always sets
+ * no_dd_clear, the second always sets no_dd_check and takes
+ * inst->no_dd_clear. NOTE(review): presumably this makes the pair act as
+ * a single instruction for dependency tracking -- confirm.
+ */
+ assert(is_power_of_two(dst.dw1.bits.writemask) &&
+ dst.dw1.bits.writemask != 0);
+ unsigned offset = __builtin_ctz(dst.dw1.bits.writemask);
+ /* Access the destination as unsigned bytes for both MOVs. */
+ dst.type = BRW_REGISTER_TYPE_UB;
+ /* Both MOVs are emitted with Align1 as the default access mode; the
+ * default is set back to Align16 at the end of this case.
+ */
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ /* Source region <4,1,0>:UB from the comment above: vertical stride 4,
+ * width 1, horizontal stride 0, read as unsigned bytes.
+ */
+ src[0].type = BRW_REGISTER_TYPE_UB;
+ src[0].vstride = BRW_VERTICAL_STRIDE_4;
+ src[0].width = BRW_WIDTH_1;
+ src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
+ dst.subnr = offset * 4; /* presumably a byte offset: dword `offset` -- confirm */
+ struct brw_inst *insn = brw_MOV(p, dst, src[0]);
+ brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_4);
+ brw_inst_set_no_dd_clear(brw, insn, true);
+ brw_inst_set_no_dd_check(brw, insn, inst->no_dd_check);
+ /* Second MOV: same packing, 16 further into both source and destination
+ * (the "dst.16"/"src.16" form above). src[0].subnr is overwritten with
+ * an absolute 16 rather than incremented, so this assumes it was 0 for
+ * the first MOV -- TODO confirm against callers.
+ */
+ src[0].subnr = 16;
+ dst.subnr = 16 + offset * 4;
+ insn = brw_MOV(p, dst, src[0]);
+ brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_4);
+ brw_inst_set_no_dd_clear(brw, insn, inst->no_dd_clear);
+ brw_inst_set_no_dd_check(brw, insn, true);
+ /* Set the default back to Align16 (unconditionally; the previous mode
+ * is not saved).
+ */
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ break;
+ }
+