if (inst->src[i].file != UNIFORM)
continue;
+ assert(type_sz(inst->src[i].type) % 4 == 0);
+ unsigned channel_size = type_sz(inst->src[i].type) / 4;
+
int reg = inst->src[i].nr;
for (int c = 0; c < 4; c++) {
if (!(readmask & (1 << c)))
continue;
- chans_used[reg] = MAX2(chans_used[reg],
- BRW_GET_SWZ(inst->src[i].swizzle, c) + 1);
+ unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
+ unsigned used = MAX2(chans_used[reg], channel * channel_size);
+ if (used <= 4)
+ chans_used[reg] = used;
+ else
+ chans_used[reg + 1] = used - 4;
}
}
int dst;
/* Find the lowest place we can slot this uniform in. */
for (dst = 0; dst < src; dst++) {
- if (chans_used[dst] + size <= 4)
- break;
+ if (chans_used[dst] + size <= 4)
+ break;
}
if (src == dst) {
- new_loc[src] = dst;
- new_chan[src] = 0;
+ new_loc[src] = dst;
+ new_chan[src] = 0;
} else {
- new_loc[src] = dst;
- new_chan[src] = chans_used[dst];
+ new_loc[src] = dst;
+ new_chan[src] = chans_used[dst];
- /* Move the references to the data */
- for (int j = 0; j < size; j++) {
- stage_prog_data->param[dst * 4 + new_chan[src] + j] =
- stage_prog_data->param[src * 4 + j];
- }
+ /* Move the references to the data */
+ for (int j = 0; j < size; j++) {
+ stage_prog_data->param[dst * 4 + new_chan[src] + j] =
+ stage_prog_data->param[src * 4 + j];
+ }
- chans_used[dst] += size;
- chans_used[src] = 0;
+ chans_used[dst] += size;
+ chans_used[src] = 0;
}
new_uniform_count = MAX2(new_uniform_count, dst + 1);
for (int i = 0 ; i < 3; i++) {
int src = inst->src[i].nr;
- if (inst->src[i].file != UNIFORM)
- continue;
+ if (inst->src[i].file != UNIFORM)
+ continue;
inst->src[i].nr = new_loc[src];
inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
(reg.type == BRW_REGISTER_TYPE_UD || \
reg.type == BRW_REGISTER_TYPE_D)
- /* "When source or destination datatype is 64b or operation is integer DWord
+ /* From the Cherryview and Broadwell PRMs:
+ *
+ * "When source or destination datatype is 64b or operation is integer DWord
* multiply, DepCtrl must not be used."
- * May apply to future SoCs as well.
+ *
+ * SKL PRMs don't include this restriction though.
*/
- if (devinfo->is_cherryview) {
+ if (devinfo->gen == 8 || devinfo->is_broxton) {
if (inst->opcode == BRW_OPCODE_MUL &&
IS_DWORD(inst->src[0]) &&
IS_DWORD(inst->src[1]))
if (inst->is_3src(devinfo)) {
/* 3-src instructions with scalar sources support arbitrary subnr,
* but don't actually use swizzles. Convert swizzle into subnr.
+ * Skip this for double-precision instructions: RepCtrl=1 is not
+ * allowed for them and needs special handling.
*/
for (int i = 0; i < 3; i++) {
- if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0) {
+ if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0 &&
+ type_sz(inst->src[i].type) < 8) {
assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
}
return progress;
}
+/**
+ * Split every MAD with a 64-bit destination into a MUL followed by an ADD.
+ *
+ * A matching MAD is replaced by
+ *
+ *    MUL tmp, src1, src2
+ *    ADD dst, tmp,  src0
+ *
+ * where tmp is a freshly allocated dvec4 temporary.  Returns true when at
+ * least one instruction was rewritten; in that case the live intervals are
+ * invalidated before returning.
+ */
+bool
+vec4_visitor::lower_64bit_mad_to_mul_add()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      /* Leave everything alone except MADs whose destination is 8 bytes. */
+      if (inst->opcode != BRW_OPCODE_MAD || type_sz(inst->dst.type) != 8)
+         continue;
+
+      /* Temporary that carries the product from the MUL to the ADD. */
+      dst_reg product = dst_reg(this, glsl_type::dvec4_type);
+
+      /* Both replacements start out as copies of the MAD, so every
+       * instruction field we do not overwrite below is inherited from it.
+       *
+       * NOTE(review): the copy presumably also carries modifiers such as
+       * saturate over to the MUL -- confirm that this is intended for the
+       * intermediate product.
+       */
+      vec4_instruction *mul_inst = new(mem_ctx) vec4_instruction(*inst);
+      vec4_instruction *add_inst = new(mem_ctx) vec4_instruction(*inst);
+
+      /* product = src1 * src2 */
+      mul_inst->opcode = BRW_OPCODE_MUL;
+      mul_inst->dst = product;
+      mul_inst->src[0] = inst->src[1];
+      mul_inst->src[1] = inst->src[2];
+      mul_inst->src[2].file = BAD_FILE;
+
+      /* dst = product + src0; the destination is the one copied from the MAD */
+      add_inst->opcode = BRW_OPCODE_ADD;
+      add_inst->src[0] = src_reg(product);
+      add_inst->src[1] = inst->src[0];
+      add_inst->src[2].file = BAD_FILE;
+
+      /* Emit MUL then ADD where the MAD used to be, then drop the MAD. */
+      inst->insert_before(block, mul_inst);
+      inst->insert_before(block, add_inst);
+      inst->remove(block);
+
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
/* The align16 hardware can only do 32-bit swizzle channels, so we need to
* translate the logical 64-bit swizzle channels that we use in the Vec4 IR
* to 32-bit swizzle channels in hardware registers.
if (failed)
return false;
+ OPT(lower_64bit_mad_to_mul_add);
OPT(scalarize_df);
setup_payload();