fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
int header_size)
{
+ assert(dst.width % 8 == 0); /* exec size must be a whole number of 8-channel units */
+ fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
+ dst, src, sources);
+ inst->header_size = header_size; /* number of leading sources forming the message header */
+
for (int i = 0; i < header_size; i++)
assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
+ inst->regs_written = header_size; /* each header source is exactly one register (asserted above) */
- uint8_t exec_size = dst.width;
- for (int i = 0; i < sources; ++i) {
- assert(src[i].width % dst.width == 0);
- if (src[i].width > exec_size)
- exec_size = src[i].width;
- }
-
- fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
- dst, src, sources);
- inst->regs_written = 0;
- for (int i = 0; i < sources; ++i) {
- /* The LOAD_PAYLOAD instruction only really makes sense if we are
- * dealing with whole registers. If this ever changes, we can deal
- * with it later.
- */
- int size = inst->src[i].effective_width * type_sz(src[i].type);
- assert(size % 32 == 0);
- inst->regs_written += (size + 31) / 32;
- }
+ for (int i = header_size; i < sources; ++i)
+ assert(src[i].file != GRF || src[i].width == dst.width);
+ inst->regs_written += (sources - header_size) * (dst.width / 8); /* each payload source spans dst.width/8 registers */
return inst;
}
if (grf_alloc.sizes[reg.reg] != this->regs_written)
return false;
- for (int i = 1; i < this->sources; i++)
- if (!this->src[i].equals(::offset(reg, i)))
+ for (int i = 0; i < this->sources; i++) {
+ reg.type = this->src[i].type; /* mirror each source's type/width so equals() compares like-for-like */
+ reg.width = this->src[i].width;
+ if (!this->src[i].equals(reg))
return false;
+ reg = ::offset(reg, 1); /* step one register per source */
+ }
return true;
}
{
bool progress = false;
- int vgrf_to_reg[alloc.count];
- int reg_count = 0;
- for (unsigned i = 0; i < alloc.count; ++i) {
- vgrf_to_reg[i] = reg_count;
- reg_count += alloc.sizes[i];
- }
+ foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+ if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+ continue;
- struct {
- bool written:1; /* Whether this register has ever been written */
- bool force_writemask_all:1;
- bool force_sechalf:1;
- } metadata[reg_count];
- memset(metadata, 0, sizeof(metadata));
+ assert(inst->dst.file == MRF || inst->dst.file == GRF);
+ assert(inst->saturate == false); /* LOAD_PAYLOAD is a pure copy; saturate is never expected */
- foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
- if (inst->dst.file == GRF) {
- const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
- bool force_sechalf = inst->force_sechalf &&
- !inst->force_writemask_all;
- bool toggle_sechalf = inst->dst.width == 16 &&
- type_sz(inst->dst.type) == 4 &&
- !inst->force_writemask_all;
- for (int i = 0; i < inst->regs_written; ++i) {
- metadata[dst_reg + i].written = true;
- metadata[dst_reg + i].force_sechalf = force_sechalf;
- metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
- force_sechalf = (toggle_sechalf != force_sechalf);
+ fs_reg dst = inst->dst;
+
+ /* Get rid of COMPR4. We'll add it back in if we need it */
+ if (dst.file == MRF)
+ dst.reg = dst.reg & ~BRW_MRF_COMPR4;
+
+ dst.width = 8; /* header copies move one physical register (SIMD8) at a time */
+ for (uint8_t i = 0; i < inst->header_size; i++) {
+ if (inst->src[i].file != BAD_FILE) {
+ fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
+ fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
+ mov_src.width = 8;
+ fs_inst *mov = MOV(mov_dst, mov_src);
+ mov->force_writemask_all = true; /* headers are copied regardless of channel enables */
+ inst->insert_before(block, mov);
}
+ dst = offset(dst, 1); /* next header register */
}
- if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
- assert(inst->dst.file == MRF || inst->dst.file == GRF);
- fs_reg dst = inst->dst;
-
- for (int i = 0; i < inst->sources; i++) {
- dst.width = inst->src[i].effective_width;
- dst.type = inst->src[i].type;
-
- if (inst->src[i].file == BAD_FILE) {
- /* Do nothing but otherwise increment as normal */
- } else if (dst.file == MRF &&
- dst.width == 8 &&
- devinfo->has_compr4 &&
- i + 4 < inst->sources &&
- inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
- fs_reg compr4_dst = dst;
- compr4_dst.reg += BRW_MRF_COMPR4;
- compr4_dst.width = 16;
- fs_reg compr4_src = inst->src[i];
- compr4_src.width = 16;
- fs_inst *mov = MOV(compr4_dst, compr4_src);
- mov->force_writemask_all = true;
- inst->insert_before(block, mov);
- /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
- inst->src[i + 4].file = BAD_FILE;
- } else {
- fs_inst *mov = MOV(dst, inst->src[i]);
- if (inst->src[i].file == GRF) {
- int src_reg = vgrf_to_reg[inst->src[i].reg] +
- inst->src[i].reg_offset;
- mov->force_sechalf = metadata[src_reg].force_sechalf;
- mov->force_writemask_all = metadata[src_reg].force_writemask_all;
+ dst.width = inst->exec_size; /* payload MOVs run at the instruction's full execution size */
+ if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
+ inst->exec_size > 8) {
+ /* In this case, the payload portion of the LOAD_PAYLOAD isn't
+ * a straightforward copy. Instead, the result of the
+ * LOAD_PAYLOAD is treated as interleaved and the first four
+ * non-header sources are unpacked as:
+ *
+ * m + 0: r0
+ * m + 1: g0
+ * m + 2: b0
+ * m + 3: a0
+ * m + 4: r1
+ * m + 5: g1
+ * m + 6: b1
+ * m + 7: a1
+ *
+ * This is used for gen <= 5 fb writes.
+ */
+ assert(inst->exec_size == 16);
+ assert(inst->header_size + 4 <= inst->sources);
+ for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
+ if (inst->src[i].file != BAD_FILE) {
+ if (devinfo->has_compr4) {
+ fs_reg compr4_dst = retype(dst, inst->src[i].type);
+ compr4_dst.reg |= BRW_MRF_COMPR4;
+
+ fs_inst *mov = MOV(compr4_dst, inst->src[i]);
+ mov->force_writemask_all = inst->force_writemask_all;
+ inst->insert_before(block, mov);
} else {
- /* We don't have any useful metadata for immediates or
- * uniforms. Assume that any of the channels of the
- * destination may be used.
- */
- assert(inst->src[i].file == IMM ||
- inst->src[i].file == UNIFORM);
- mov->force_writemask_all = true;
+ /* Platform doesn't have COMPR4. We have to fake it */
+ fs_reg mov_dst = retype(dst, inst->src[i].type);
+ mov_dst.width = 8;
+
+ fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
+ mov->force_writemask_all = inst->force_writemask_all;
+ inst->insert_before(block, mov);
+
+ mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1)); /* second half lands 4 MRFs later */
+ mov->force_writemask_all = inst->force_writemask_all;
+ mov->force_sechalf = true;
+ inst->insert_before(block, mov);
}
-
- if (dst.file == GRF) {
- const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
- const bool force_writemask = mov->force_writemask_all;
- metadata[dst_reg].force_writemask_all = force_writemask;
- metadata[dst_reg].force_sechalf = mov->force_sechalf;
- if (dst.width * type_sz(dst.type) > 32) {
- assert(!mov->force_sechalf);
- metadata[dst_reg + 1].force_writemask_all = force_writemask;
- metadata[dst_reg + 1].force_sechalf = !force_writemask;
- }
- }
-
- inst->insert_before(block, mov);
}
- dst = offset(dst, 1);
+ dst.reg++;
}
- inst->remove(block);
- progress = true;
+ /* The loop above only ever incremented us through the first set
+ * of 4 registers. However, thanks to the magic of COMPR4, we
+ * actually wrote to the first 8 registers, so we need to take
+ * that into account now.
+ */
+ dst.reg += 4;
+
+ /* The COMPR4 code took care of the first 4 sources. We'll let
+ * the regular path handle any remaining sources. Yes, we are
+ * modifying the instruction but we're about to delete it so
+ * this really doesn't hurt anything.
+ */
+ inst->header_size += 4;
}
+
+ for (uint8_t i = inst->header_size; i < inst->sources; i++) {
+ if (inst->src[i].file != BAD_FILE) {
+ fs_inst *mov = MOV(retype(dst, inst->src[i].type),
+ inst->src[i]);
+ mov->force_writemask_all = inst->force_writemask_all;
+ inst->insert_before(block, mov);
+ }
+ dst = offset(dst, 1);
+ }
+
+ inst->remove(block); /* fully lowered to MOVs above */
+ progress = true;
}
if (progress)
bool optimize_frontfacing_ternary(nir_alu_instr *instr,
const fs_reg &result);
- int setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
- bool use_2nd_half);
+ void setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
+ unsigned exec_size, bool use_2nd_half); /* no longer returns a length; callers advance by a fixed 4 */
void emit_alpha_test();
fs_inst *emit_single_fb_write(fs_reg color1, fs_reg color2,
fs_reg src0_alpha, unsigned components,
{
int written = inst->regs_written;
int dst_width = inst->dst.width / 8;
- fs_reg dst = inst->dst;
fs_inst *copy;
if (written > dst_width) {
- fs_reg *sources = ralloc_array(v->mem_ctx, fs_reg, written / dst_width);
- for (int i = 0; i < written / dst_width; i++)
- sources[i] = offset(src, i);
- copy = v->LOAD_PAYLOAD(dst, sources, written / dst_width,
- inst->header_size);
+ fs_reg *payload;
+ int sources, header_size;
+ if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
+ sources = inst->sources;
+ header_size = inst->header_size;
+ } else {
+ assert(written % dst_width == 0); /* non-payload instructions write whole dst-width chunks */
+ sources = written / dst_width;
+ header_size = 0;
+ }
+
+ assert(src.file == GRF);
+ payload = ralloc_array(v->mem_ctx, fs_reg, sources);
+ for (int i = 0; i < header_size; i++) {
+ payload[i] = src;
+ payload[i].width = 8; /* header entries are always one full SIMD8 register */
+ src.reg_offset++;
+ }
+ for (int i = header_size; i < sources; i++) {
+ payload[i] = src;
+ src = offset(src, 1);
+ }
+ copy = v->LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
} else {
- copy = v->MOV(dst, src);
+ copy = v->MOV(inst->dst, src);
copy->force_writemask_all = inst->force_writemask_all;
copy->src[0].negate = negate;
}
mlen = length * reg_width;
fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
- BRW_REGISTER_TYPE_F);
+ BRW_REGISTER_TYPE_F, dispatch_width); /* VGRF now carries its own width */
emit(LOAD_PAYLOAD(src_payload, sources, length, header_size));
/* Generate the SEND */
{
int reg_width = dispatch_width / 8;
fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
- BRW_REGISTER_TYPE_F);
+ BRW_REGISTER_TYPE_F, dispatch_width); /* width must match the LOAD_PAYLOAD exec size */
fs_reg dest = vgrf(glsl_type::uvec4_type);
fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
int mlen = 1 + (length - 1) * reg_width;
fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
- BRW_REGISTER_TYPE_UD);
+ BRW_REGISTER_TYPE_UD, dispatch_width);
emit(LOAD_PAYLOAD(src_payload, sources, length, 1));
/* Emit the instruction. */
int mlen = 1 + reg_width;
fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
- BRW_REGISTER_TYPE_UD);
+ BRW_REGISTER_TYPE_UD, dispatch_width); /* width must match the LOAD_PAYLOAD exec size */
fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2, 1));
/* Emit the instruction. */
this->current_annotation = NULL;
}
-int
+void
fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
- bool use_2nd_half)
+ unsigned exec_size, bool use_2nd_half)
{
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
fs_inst *inst;
- if (color.file == BAD_FILE) {
- return 4 * (dispatch_width / 8);
- }
-
- uint8_t colors_enabled;
- if (components == 0) {
- /* We want to write one component to the alpha channel */
- colors_enabled = 0x8;
- } else {
- /* Enable the first components-many channels */
- colors_enabled = (1 << components) - 1;
+ if (key->clamp_fragment_color) { /* clamp once up front; MOVs below no longer set saturate per-copy */
+ fs_reg tmp = vgrf(glsl_type::vec4_type);
+ assert(color.type == BRW_REGISTER_TYPE_F);
+ for (unsigned i = 0; i < components; i++) {
+ inst = emit(MOV(offset(tmp, i), offset(color, i)));
+ inst->saturate = true;
+ }
+ color = tmp;
}
- if (dispatch_width == 8 || (devinfo->gen >= 6 && !do_dual_src)) {
- /* SIMD8 write looks like:
- * m + 0: r0
- * m + 1: r1
- * m + 2: g0
- * m + 3: g1
- *
- * gen6 SIMD16 DP write looks like:
- * m + 0: r0
- * m + 1: r1
- * m + 2: g0
- * m + 3: g1
- * m + 4: b0
- * m + 5: b1
- * m + 6: a0
- * m + 7: a1
- */
- int len = 0;
- for (unsigned i = 0; i < 4; ++i) {
- if (colors_enabled & (1 << i)) {
- dst[len] = fs_reg(GRF, alloc.allocate(color.width / 8),
- color.type, color.width);
- inst = emit(MOV(dst[len], offset(color, i)));
- inst->saturate = key->clamp_fragment_color;
- } else if (color.width == 16) {
- /* We need two BAD_FILE slots for a 16-wide color */
- len++;
- }
- len++;
- }
- return len;
- } else if (devinfo->gen >= 6 && do_dual_src) {
- /* SIMD16 dual source blending for gen6+.
- *
- * From the SNB PRM, volume 4, part 1, page 193:
- *
- * "The dual source render target messages only have SIMD8 forms due to
- * maximum message length limitations. SIMD16 pixel shaders must send two
- * of these messages to cover all of the pixels. Each message contains
- * two colors (4 channels each) for each pixel in the message payload."
- *
- * So in SIMD16 dual source blending we will send 2 SIMD8 messages,
- * each one will call this function twice (one for each color involved),
- * so in each pass we only write 4 registers. Notice that the second
- * SIMD8 message needs to read color data from the 2nd half of the color
- * registers, so it needs to call this with use_2nd_half = true.
- */
- for (unsigned i = 0; i < 4; ++i) {
- if (colors_enabled & (1 << i)) {
- dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
- inst = emit(MOV(dst[i], half(offset(color, i),
- use_2nd_half ? 1 : 0)));
- inst->saturate = key->clamp_fragment_color;
- if (use_2nd_half)
- inst->force_sechalf = true;
- }
- }
- return 4;
+ if (exec_size < dispatch_width) { /* SIMD8 message from a SIMD16 shader: select the requested half */
+ unsigned half_idx = use_2nd_half ? 1 : 0;
+ for (unsigned i = 0; i < components; i++)
+ dst[i] = half(offset(color, i), half_idx);
} else {
- /* pre-gen6 SIMD16 single source DP write looks like:
- * m + 0: r0
- * m + 1: g0
- * m + 2: b0
- * m + 3: a0
- * m + 4: r1
- * m + 5: g1
- * m + 6: b1
- * m + 7: a1
- */
- for (unsigned i = 0; i < 4; ++i) {
- if (colors_enabled & (1 << i)) {
- dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
- inst = emit(MOV(dst[i], half(offset(color, i), 0)));
- inst->saturate = key->clamp_fragment_color;
-
- dst[i + 4] = fs_reg(GRF, alloc.allocate(1), color.type);
- inst = emit(MOV(dst[i + 4], half(offset(color, i), 1)));
- inst->saturate = key->clamp_fragment_color;
- inst->force_sechalf = true;
- }
- }
- return 8;
+ for (unsigned i = 0; i < components; i++)
+ dst[i] = offset(color, i); /* full width: hand the color registers through untouched */
}
}
this->current_annotation = "FB write header";
int header_size = 2, payload_header_size;
- int reg_size = exec_size / 8;
/* We can potentially have a message length of up to 15, so we have to set
* base_mrf to either 0 or 1 in order to fit in m0..m15.
* alpha out the pipeline to our null renderbuffer to support
* alpha-testing, alpha-to-coverage, and so on.
*/
- length += setup_color_payload(sources + length, this->outputs[0], 0,
- false);
+ if (this->outputs[0].file != BAD_FILE)
+ setup_color_payload(&sources[length + 3], offset(this->outputs[0], 3),
+ 1, exec_size, false);
+ length += 4; /* always reserve all 4 color slots; unwritten ones stay BAD_FILE */
} else if (color1.file == BAD_FILE) {
if (src0_alpha.file != BAD_FILE) {
- sources[length] = fs_reg(GRF, alloc.allocate(reg_size),
- src0_alpha.type, src0_alpha.width);
- fs_inst *inst = emit(MOV(sources[length], src0_alpha));
- inst->saturate = key->clamp_fragment_color;
+ setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false);
length++;
}
- length += setup_color_payload(sources + length, color0, components,
- false);
+ setup_color_payload(&sources[length], color0, components,
+ exec_size, use_2nd_half);
+ length += 4;
} else {
- length += setup_color_payload(sources + length, color0, components,
- use_2nd_half);
- length += setup_color_payload(sources + length, color1, components,
- use_2nd_half);
+ setup_color_payload(&sources[length], color0, components,
+ exec_size, use_2nd_half);
+ length += 4;
+ setup_color_payload(&sources[length], color1, components,
+ exec_size, use_2nd_half);
+ length += 4;
}
if (source_depth_to_render_target) {
no16("Missing support for simd16 depth writes on gen6\n");
}
- sources[length] = vgrf(glsl_type::float_type);
if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
/* Hand over gl_FragDepth. */
assert(this->frag_depth.file != BAD_FILE);
- emit(MOV(sources[length], this->frag_depth));
+ sources[length] = this->frag_depth; /* LOAD_PAYLOAD lowering emits the MOV for us now */
} else {
/* Pass through the payload depth. */
- emit(MOV(sources[length],
- fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
+ sources[length] = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
}
length++;
}
- if (payload.dest_depth_reg) {
- sources[length] = vgrf(glsl_type::float_type);
- emit(MOV(sources[length],
- fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))));
- length++;
- }
+ if (payload.dest_depth_reg)
+ sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
fs_inst *load;
fs_inst *write;
if (devinfo->gen >= 7) {
/* Send from the GRF */
- fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
+ fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F, exec_size); /* reg filled in below once regs_written is known */
load = emit(LOAD_PAYLOAD(payload, sources, length, payload_header_size));
payload.reg = alloc.allocate(load->regs_written);
- payload.width = dispatch_width;
load->dst = payload;
write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
write->base_mrf = -1;
} else {
/* Send from the MRF */
- load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
+ load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size),
sources, length, payload_header_size));
+
+ /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD
+ * will do this for us if we just give it a COMPR4 destination.
+ */
+ if (brw->gen < 6 && exec_size == 16) /* NOTE(review): rest of this patch uses devinfo->gen — confirm brw is still in scope here */
+ load->dst.reg |= BRW_MRF_COMPR4;
+
write = emit(FS_OPCODE_FB_WRITE);
write->exec_size = exec_size;
write->base_mrf = 1;
if (flush) {
fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
- BRW_REGISTER_TYPE_F);
+ BRW_REGISTER_TYPE_F, dispatch_width);
/* We need WE_all on the MOV for the message header (the URB handles)
* so do a MOV to a dummy register and set force_writemask_all on the