else
return 1;
}
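+   /* The per-slot offset is passed in src[0] as a single vec2 (X and Y);
+    * every other source is a scalar.
+    */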
+ case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+ return (i == 0 ? 2 : 1);
default:
return 1;
case SHADER_OPCODE_TYPED_SURFACE_WRITE:
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
- case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
case SHADER_OPCODE_BYTE_SCATTERED_READ:
if (arg == 0)
/* gl_FragCoord.z */
if (devinfo->gen >= 6) {
- bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
+ bld.MOV(wpos, fetch_payload_reg(bld, payload.source_depth_reg));
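+      /* fetch_payload_reg assembles a register region covering the payload
+       * GRFs for the full dispatch width, so the MOV above works unchanged
+       * for SIMD8, SIMD16, and SIMD32.
+       */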
} else {
bld.emit(FS_OPCODE_LINTERP, wpos,
this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
- * The X, Y sample positions come in as bytes in thread payload. So, read
- * the positions using vstride=16, width=8, hstride=2.
+ * The X, Y sample positions come in as bytes in the thread payload. Read
+ * them through a W-typed view of the payload and use subscript() to pick
+ * out the X (byte 0) and Y (byte 1) components with a stride of 2 (the
+ * equivalent of the old vstride=16, width=8, hstride=2 region).
  */
- struct brw_reg sample_pos_reg =
- stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
- BRW_REGISTER_TYPE_B), 16, 8, 2);
+ const fs_reg sample_pos_reg =
+ fetch_payload_reg(abld, payload.sample_pos_reg, BRW_REGISTER_TYPE_W);
- if (dispatch_width == 8) {
- abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
- } else {
- abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
- abld.half(1).MOV(half(int_sample_x, 1),
- fs_reg(suboffset(sample_pos_reg, 16)));
- }
/* Compute gl_SamplePosition.x */
- compute_sample_position(pos, int_sample_x);
- pos = offset(pos, abld, 1);
- if (dispatch_width == 8) {
- abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
- } else {
- abld.half(0).MOV(half(int_sample_y, 0),
- fs_reg(suboffset(sample_pos_reg, 1)));
- abld.half(1).MOV(half(int_sample_y, 1),
- fs_reg(suboffset(sample_pos_reg, 17)));
- }
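+   /* A byte-typed subscript of the W-typed payload register gives the
+    * stride-2 byte view described above, so one MOV per component covers
+    * every dispatch width.
+    */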
+ abld.MOV(int_sample_x, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 0));
+ compute_sample_position(offset(pos, abld, 0), int_sample_x);
+
/* Compute gl_SamplePosition.y */
- compute_sample_position(pos, int_sample_y);
+ abld.MOV(int_sample_y, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 1));
+ compute_sample_position(offset(pos, abld, 1), int_sample_y);
return reg;
}
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
- fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
- BRW_REGISTER_TYPE_D));
+ fs_reg coverage_mask =
+ fetch_payload_reg(bld, payload.sample_mask_in_reg, BRW_REGISTER_TYPE_D);
if (wm_prog_data->persample_dispatch) {
/* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
{
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
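+   /* Sampler EOT replaces the final FB write with a sampler message;
+    * since a SIMD32 shader splits its FB writes into multiple SIMD16
+    * messages, it presumably cannot use this trick, hence the width
+    * check below.
+    */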
- if (stage != MESA_SHADER_FRAGMENT)
+ if (stage != MESA_SHADER_FRAGMENT || dispatch_width > 16)
return false;
if (devinfo->gen != 9 && !devinfo->is_cherryview)
unsigned length = 0;
if (devinfo->gen < 6) {
+ /* TODO: Support SIMD32 on gen4-5 */
+ assert(bld.group() < 16);
+
/* For gen4-5, we always have a header consisting of g0 and g1. We have
* an implied MOV from g0,g1 to the start of the message. The MOV from
* g0 is handled by the hardware and the MOV from g1 is provided by the
*/
const fs_builder ubld = bld.exec_all().group(8, 0);
- /* The header starts off as g0 and g1 */
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
- BRW_REGISTER_TYPE_UD));
+ if (bld.group() < 16) {
+ /* The header starts off as g0 and g1 for the first half */
+ ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
+ BRW_REGISTER_TYPE_UD));
+ } else {
+ /* The header starts off as g0 and g2 for the second half */
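+         /* In the SIMD32 PS payload, g1 carries the dispatch information
+          * for the low 16 channels and g2 carries it for the high 16, so
+          * the second half builds its header from g0 and g2.
+          */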
+ assert(bld.group() < 32);
+ const fs_reg header_sources[2] = {
+ retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
+ retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
+ };
+ ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
+ }
uint32_t g00_bits = 0;
}
if (prog_data->uses_kill) {
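+      /* When discard is in use, header component 15 carries the
+       * live-channel mask from f0.1; a flag subregister only covers 16
+       * channels, hence the assertion below.
+       */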
+ assert(bld.group() < 16);
ubld.group(1, 0).MOV(retype(component(header, 15),
BRW_REGISTER_TYPE_UW),
brw_flag_reg(0, 1));
assert(length == 0 || length == 2);
header_size = length;
- if (payload.aa_dest_stencil_reg) {
+ if (payload.aa_dest_stencil_reg[0]) {
+ assert(inst->group < 16);
sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
.MOV(sources[length],
- fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
+ fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
length++;
}
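+   /* The oMask payload register covers at most 16 channels, so for the
+    * second half of a SIMD32 thread the offset is taken modulo 16.
+    */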
bld.exec_all().annotate("FB write oMask")
.MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
- inst->group),
+ inst->group % 16),
sample_mask);
length++;
}
if (src_stencil.file != BAD_FILE) {
assert(devinfo->gen >= 9);
- assert(bld.dispatch_width() != 16);
+ assert(bld.dispatch_width() == 8);
/* XXX: src_stencil is only available on gen9+. dst_depth is never
* available on gen9+. As such it's impossible to have both enabled at the
static void
lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
{
- const fs_builder &ubld = bld.exec_all();
+ const fs_builder &ubld = bld.exec_all().group(8, 0);
const unsigned length = 2;
- const fs_reg header = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD, length);
+ const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);
- ubld.group(16, 0)
- .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+ if (bld.group() < 16) {
+ ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
+ BRW_REGISTER_TYPE_UD));
+ } else {
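+      /* As in the FB write header above, the second half of a SIMD32
+       * thread takes its dispatch information from g2 rather than g1.
+       */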
+ assert(bld.group() < 32);
+ const fs_reg header_sources[] = {
+ retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
+ retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
+ };
+ ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);
+ }
inst->resize_sources(1);
inst->src[0] = header;
* after \p inst, inst->next is a moving target and we need to save
* it off here so that we insert the zip instructions in the right
* place.
+ *
+ * Since we're inserting split instructions after after_inst, the
+ * instructions will end up in the reverse order that we insert them.
+ * However, certain render target writes require that the low group
+ * instructions come before the high group. From the Ivy Bridge PRM
+ * Vol. 4, Pt. 1, Section 3.9.11:
+ *
+ * "If multiple SIMD8 Dual Source messages are delivered by the
+ * pixel shader thread, each SIMD8_DUALSRC_LO message must be
+ * issued before the SIMD8_DUALSRC_HI message with the same Slot
+ * Group Select setting."
+ *
+ * And, from Section 3.9.11.1 of the same PRM:
+ *
+ * "When SIMD32 or SIMD16 PS threads send render target writes
+ * with multiple SIMD8 and SIMD16 messages, the following must
+ * hold:
+ *
+ * All the slots (as described above) must have a corresponding
+ * render target write irrespective of the slot's validity. A slot
+ * is considered valid when at least one sample is enabled. For
+ * example, a SIMD16 PS thread must send two SIMD8 render target
+ * writes to cover all the slots.
+ *
+ * PS thread must send SIMD render target write messages with
+ * increasing slot numbers. For example, SIMD16 thread has
+ * Slot[15:0] and if two SIMD8 render target writes are used, the
+ * first SIMD8 render target write must send Slot[7:0] and the
+ * next one must send Slot[15:8]."
+ *
+ * In order to make low group instructions come before high group
+ * instructions (this is required for some render target writes), we
+ * split from the highest group to lowest.
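+       *
+       * For example, a SIMD32 instruction lowered to SIMD8 is emitted with
+       * i = 3, 2, 1, 0; because each copy is inserted immediately after
+       * \p inst, the four groups land in program order 0, 8, 16, 24.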
*/
exec_node *const after_inst = inst->next;
- for (unsigned i = 0; i < n; i++) {
+ for (int i = n - 1; i >= 0; i--) {
/* Emit a copy of the original instruction with the lowered width.
* If the EOT flag was set throw it away except for the last
* instruction to avoid killing the thread prematurely.
*/
fs_inst split_inst = *inst;
split_inst.exec_size = lower_width;
- split_inst.eot = inst->eot && i == 0;
+ split_inst.eot = inst->eot && i == n - 1;
/* Select the correct channel enables for the i-th group, then
* transform the sources and destination and emit the lowered
*/
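+   /* These payload register fields are two-element arrays: index 0 names
+    * the register for the first SIMD16 half of the thread, and index 1
+    * presumably will name the second half's register for SIMD32 dispatch.
+    */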
for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
if (prog_data->barycentric_interp_modes & (1 << i)) {
- payload.barycentric_coord_reg[i] = payload.num_regs;
+ payload.barycentric_coord_reg[i][0] = payload.num_regs;
payload.num_regs += 2;
if (dispatch_width == 16) {
payload.num_regs += 2;
prog_data->uses_src_depth =
(nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
if (prog_data->uses_src_depth) {
- payload.source_depth_reg = payload.num_regs;
+ payload.source_depth_reg[0] = payload.num_regs;
payload.num_regs++;
if (dispatch_width == 16) {
/* R28: interpolated depth if not SIMD8. */
prog_data->uses_src_w =
(nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
if (prog_data->uses_src_w) {
- payload.source_w_reg = payload.num_regs;
+ payload.source_w_reg[0] = payload.num_regs;
payload.num_regs++;
if (dispatch_width == 16) {
/* R30: interpolated W if not SIMD8. */
* persample dispatch, we hard-code it to 0.5.
*/
prog_data->uses_pos_offset = true;
- payload.sample_pos_reg = payload.num_regs;
+ payload.sample_pos_reg[0] = payload.num_regs;
payload.num_regs++;
}
(nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
if (prog_data->uses_sample_mask) {
assert(devinfo->gen >= 7);
- payload.sample_mask_in_reg = payload.num_regs;
+ payload.sample_mask_in_reg[0] = payload.num_regs;
payload.num_regs++;
if (dispatch_width == 16) {
/* R33: input coverage mask if not SIMD8. */
const nir_shader *src_shader,
struct gl_program *prog,
int shader_time_index8, int shader_time_index16,
- bool allow_spilling,
+ int shader_time_index32, bool allow_spilling,
bool use_rep_send, struct brw_vue_map *vue_map,
char **error_str)
{