else {
insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
- /* even ignored in da16, still need to set as '01' */
+ /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
+ * Although Dst.HorzStride is a don't care for Align16, HW needs
+ * this to be programmed as "01".
+ */
insn->bits1.da16.dest_horiz_stride = 1;
}
}
brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
struct brw_reg reg)
{
+ struct brw_context *brw = p->brw;
+ struct intel_context *intel = &brw->intel;
+
if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
assert(reg.nr < 128);
gen7_convert_mrf_to_grf(p, &reg);
+ if (intel->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
+ insn->header.opcode == BRW_OPCODE_SENDC)) {
+ /* Any source modifiers or regions will be ignored, since this just
+ * identifies the MRF/GRF to start reading the message contents from.
+ * Check for some likely failures.
+ */
+ assert(!reg.negate);
+ assert(!reg.abs);
+ assert(reg.address_mode == BRW_ADDRESS_DIRECT);
+ }
+
validate_reg(insn, reg);
insn->bits1.da1.src0_reg_file = reg.file;
{
assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
- assert(reg.nr < 128);
+ if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
+ assert(reg.nr < 128);
gen7_convert_mrf_to_grf(p, &reg);
GLuint msg_type,
GLuint target_cache,
GLuint msg_length,
+ bool header_present,
GLuint response_length)
{
struct brw_context *brw = p->brw;
}
brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
- true, false);
+ header_present, false);
if (intel->gen >= 7) {
insn->bits3.gen7_dp.binding_table_index = binding_table_index;
struct brw_reg src1,
struct brw_reg src2)
{
+ struct intel_context *intel = &p->brw->intel;
struct brw_instruction *insn = next_insn(p, opcode);
gen7_convert_mrf_to_grf(p, &dest);
dest.file == BRW_MESSAGE_REGISTER_FILE);
assert(dest.nr < 128);
assert(dest.address_mode == BRW_ADDRESS_DIRECT);
- assert(dest.type = BRW_REGISTER_TYPE_F);
+ assert(dest.type == BRW_REGISTER_TYPE_F ||
+ dest.type == BRW_REGISTER_TYPE_D ||
+ dest.type == BRW_REGISTER_TYPE_UD);
insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
insn->bits1.da3src.dest_reg_nr = dest.nr;
insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
assert(src0.file == BRW_GENERAL_REGISTER_FILE);
assert(src0.address_mode == BRW_ADDRESS_DIRECT);
assert(src0.nr < 128);
- assert(src0.type == BRW_REGISTER_TYPE_F);
insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
insn->bits2.da3src.src0_reg_nr = src0.nr;
assert(src1.file == BRW_GENERAL_REGISTER_FILE);
assert(src1.address_mode == BRW_ADDRESS_DIRECT);
assert(src1.nr < 128);
- assert(src1.type == BRW_REGISTER_TYPE_F);
insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
assert(src2.file == BRW_GENERAL_REGISTER_FILE);
assert(src2.address_mode == BRW_ADDRESS_DIRECT);
assert(src2.nr < 128);
- assert(src2.type == BRW_REGISTER_TYPE_F);
insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
insn->bits1.da3src.src2_abs = src2.abs;
insn->bits1.da3src.src2_negate = src2.negate;
+ if (intel->gen >= 7) {
+ /* Set both the source and destination types based on dest.type,
+ * ignoring the source register types. The MAD and LRP emitters ensure
+ * that all four types are float. The BFE and BFI2 emitters, however,
+ * may send us mixed D and UD types and want us to ignore that and use
+ * the destination type.
+ */
+ switch (dest.type) {
+ case BRW_REGISTER_TYPE_F:
+ insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
+ insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
+ break;
+ case BRW_REGISTER_TYPE_D:
+ insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
+ insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
+ break;
+ case BRW_REGISTER_TYPE_UD:
+ insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
+ insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
+ break;
+ }
+ }
+
return insn;
}
return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
+#define ALU3F(OP) \
+struct brw_instruction *brw_##OP(struct brw_compile *p, \
+ struct brw_reg dest, \
+ struct brw_reg src0, \
+ struct brw_reg src1, \
+ struct brw_reg src2) \
+{ \
+ assert(dest.type == BRW_REGISTER_TYPE_F); \
+ assert(src0.type == BRW_REGISTER_TYPE_F); \
+ assert(src1.type == BRW_REGISTER_TYPE_F); \
+ assert(src2.type == BRW_REGISTER_TYPE_F); \
+ return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
+}
+
/* Rounding operations (other than RNDD) require two instructions - the first
* stores a rounded value (possibly the wrong way) in the dest register, but
* also sets a per-channel "increment bit" in the flag register. A predicated
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
+ALU1(F32TO16)
+ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
-ALU3(MAD)
+ALU3F(MAD)
+ALU3F(LRP)
+ALU1(BFREV)
+ALU3(BFE)
+ALU2(BFI1)
+ALU3(BFI2)
+ALU1(FBH)
+ALU1(FBL)
+ALU1(CBIT)
ROUND(RNDZ)
ROUND(RNDE)
return insn;
}
+struct brw_instruction *gen6_HALT(struct brw_compile *p)
+{
+ struct brw_instruction *insn;
+
+ insn = next_insn(p, BRW_OPCODE_HALT);
+ brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+ brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+ brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
+
+ if (p->compressed) {
+ insn->header.execution_size = BRW_EXECUTE_16;
+ } else {
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.execution_size = BRW_EXECUTE_8;
+ }
+ return insn;
+}
+
/* DO/WHILE loop:
*
* The DO/WHILE is just an unterminated loop -- break or continue are
struct brw_reg src0,
struct brw_reg src1)
{
+ struct intel_context *intel = &p->brw->intel;
struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
insn->header.destreg__conditionalmod = conditional;
p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
p->flag_value = 0xff;
}
+
+ /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
+ * page says:
+ * "Any CMP instruction with a null destination must use a {switch}."
+ *
+ * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
+ * mentioned on their work-arounds pages.
+ */
+ if (intel->gen == 7) {
+ if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+ dest.nr == BRW_ARF_NULL) {
+ insn->header.thread_control = BRW_THREAD_SWITCH;
+ }
+ }
}
/* Issue 'wait' instruction for n1, host could program MMIO
if (intel->gen >= 6) {
struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
- assert(dest.file == BRW_GENERAL_REGISTER_FILE);
+ assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
+ (intel->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
assert(src.file == BRW_GENERAL_REGISTER_FILE);
assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
(void) intel;
- assert(dest.file == BRW_GENERAL_REGISTER_FILE);
+ assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
+ (intel->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
assert(src0.file == BRW_GENERAL_REGISTER_FILE);
assert(src1.file == BRW_GENERAL_REGISTER_FILE);
BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
1, /* msg_length */
+ true, /* header_present */
rlen);
}
}
BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1, /* msg_length */
+ true, /* header_present */
1); /* response_length (1 reg, 2 owords!) */
brw_pop_insn_state(p);
struct brw_reg src0,
GLuint binding_table_index,
GLuint sampler,
- GLuint writemask,
GLuint msg_type,
GLuint response_length,
GLuint msg_length,
GLuint return_format)
{
struct intel_context *intel = &p->brw->intel;
- bool need_stall = 0;
-
- if (writemask == 0) {
- /*printf("%s: zero writemask??\n", __FUNCTION__); */
- return;
- }
-
- /* Hardware doesn't do destination dependency checking on send
- * instructions properly. Add a workaround which generates the
- * dependency by other means. In practice it seems like this bug
- * only crops up for texture samples, and only where registers are
- * written by the send and then written again later without being
- * read in between. Luckily for us, we already track that
- * information and use it to modify the writemask for the
- * instruction, so that is a guide for whether a workaround is
- * needed.
- */
- if (writemask != WRITEMASK_XYZW) {
- GLuint dst_offset = 0;
- GLuint i, newmask = 0, len = 0;
-
- for (i = 0; i < 4; i++) {
- if (writemask & (1<<i))
- break;
- dst_offset += 2;
- }
- for (; i < 4; i++) {
- if (!(writemask & (1<<i)))
- break;
- newmask |= 1<<i;
- len++;
- }
-
- if (newmask != writemask) {
- need_stall = 1;
- /* printf("need stall %x %x\n", newmask , writemask); */
- }
- else {
- bool dispatch_16 = false;
-
- struct brw_reg m1 = brw_message_reg(msg_reg_nr);
-
- guess_execution_size(p, p->current, dest);
- if (p->current->header.execution_size == BRW_EXECUTE_16)
- dispatch_16 = true;
-
- newmask = ~newmask & WRITEMASK_XYZW;
-
- brw_push_insn_state(p);
-
- brw_set_compression_control(p, BRW_COMPRESSION_NONE);
- brw_set_mask_control(p, BRW_MASK_DISABLE);
-
- brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
- retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
- brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
-
- brw_pop_insn_state(p);
-
- src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
- dest = offset(dest, dst_offset);
-
- /* For 16-wide dispatch, masked channels are skipped in the
- * response. For 8-wide, masked channels still take up slots,
- * and are just not written to.
- */
- if (dispatch_16)
- response_length = len * 2;
- }
- }
-
- {
- struct brw_instruction *insn;
-
- gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+ struct brw_instruction *insn;
- insn = next_insn(p, BRW_OPCODE_SEND);
- insn->header.predicate_control = 0; /* XXX */
- insn->header.compression_control = BRW_COMPRESSION_NONE;
- if (intel->gen < 6)
- insn->header.destreg__conditionalmod = msg_reg_nr;
+ gen6_resolve_implied_move(p, &src0, msg_reg_nr);
- brw_set_dest(p, insn, dest);
- brw_set_src0(p, insn, src0);
- brw_set_sampler_message(p, insn,
- binding_table_index,
- sampler,
- msg_type,
- response_length,
- msg_length,
- header_present,
- simd_mode,
- return_format);
- }
-
- if (need_stall) {
- struct brw_reg reg = vec8(offset(dest, response_length-1));
-
- /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
- */
- brw_push_insn_state(p);
- brw_set_compression_control(p, BRW_COMPRESSION_NONE);
- brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
- retype(reg, BRW_REGISTER_TYPE_UD));
- brw_pop_insn_state(p);
- }
+ insn = next_insn(p, BRW_OPCODE_SEND);
+ insn->header.predicate_control = 0; /* XXX */
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ if (intel->gen < 6)
+ insn->header.destreg__conditionalmod = msg_reg_nr;
+ brw_set_dest(p, insn, dest);
+ brw_set_src0(p, insn, src0);
+ brw_set_sampler_message(p, insn,
+ binding_table_index,
+ sampler,
+ msg_type,
+ response_length,
+ msg_length,
+ header_present,
+ simd_mode,
+ return_format);
}
/* All these variables are pretty confusing - we might be better off
/* Enable Channel Masks in the URB_WRITE_HWORD message header */
brw_push_insn_state(p);
brw_set_access_mode(p, BRW_ALIGN_1);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
BRW_REGISTER_TYPE_UD),
retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
case BRW_OPCODE_ENDIF:
case BRW_OPCODE_ELSE:
case BRW_OPCODE_WHILE:
+ case BRW_OPCODE_HALT:
return ip;
}
}
- assert(!"not reached");
- return start + 1;
+
+ return 0;
}
/* There is no DO instruction on gen6, so to find the end of the loop
}
/* After program generation, go back and update the UIP and JIP of
- * BREAK and CONT instructions to their correct locations.
+ * BREAK, CONT, and HALT instructions to their correct locations.
*/
void
brw_set_uip_jip(struct brw_compile *p)
continue;
}
+ int block_end_ip = brw_find_next_block_end(p, ip);
switch (insn->header.opcode) {
case BRW_OPCODE_BREAK:
- insn->bits3.break_cont.jip =
- (brw_find_next_block_end(p, ip) - ip) / scale;
+ assert(block_end_ip != 0);
+ insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
/* Gen7 UIP points to WHILE; Gen6 points just after it */
insn->bits3.break_cont.uip =
(brw_find_loop_end(p, ip) - ip +
(intel->gen == 6 ? 16 : 0)) / scale;
break;
case BRW_OPCODE_CONTINUE:
- insn->bits3.break_cont.jip =
- (brw_find_next_block_end(p, ip) - ip) / scale;
+ assert(block_end_ip != 0);
+ insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
insn->bits3.break_cont.uip =
(brw_find_loop_end(p, ip) - ip) / scale;
assert(insn->bits3.break_cont.uip != 0);
assert(insn->bits3.break_cont.jip != 0);
break;
+
+ case BRW_OPCODE_ENDIF:
+ if (block_end_ip == 0)
+ insn->bits3.break_cont.jip = 2;
+ else
+ insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
+ break;
+
+ case BRW_OPCODE_HALT:
+ /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
+ *
+ * "In case of the halt instruction not inside any conditional
+ * code block, the value of <JIP> and <UIP> should be the
+ * same. In case of the halt instruction inside conditional code
+ * block, the <UIP> should be the end of the program, and the
+ * <JIP> should be end of the most inner conditional code block."
+ *
+ * The uip will have already been set by whoever set up the
+ * instruction.
+ */
+ if (block_end_ip == 0) {
+ insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
+ } else {
+ insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
+ }
+ assert(insn->bits3.break_cont.uip != 0);
+ assert(insn->bits3.break_cont.jip != 0);
+ break;
}
}
}
0, /* end_of_thread */
send_commit_msg); /* send_commit_msg */
}
+
+/**
+ * This instruction is generated as a single-channel align1 instruction by
+ * both the VS and FS stages when using INTEL_DEBUG=shader_time.
+ *
+ * We can't use the typed atomic op in the FS because that has the execution
+ * mask ANDed with the pixel mask, but we just want to write the one dword for
+ * all the pixels.
+ *
+ * We don't use the SIMD4x2 atomic ops in the VS because want to just write
+ * one u32. So we use the same untyped atomic write message as the pixel
+ * shader.
+ *
+ * The untyped atomic operation requires a BUFFER surface type with RAW
+ * format, and is only accessible through the legacy DATA_CACHE dataport
+ * messages.
+ */
+void brw_shader_time_add(struct brw_compile *p,
+ struct brw_reg payload,
+ uint32_t surf_index)
+{
+ struct intel_context *intel = &p->brw->intel;
+ assert(intel->gen >= 7);
+
+ brw_push_insn_state(p);
+ brw_set_access_mode(p, BRW_ALIGN_1);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+ brw_pop_insn_state(p);
+
+ /* We use brw_vec1_reg and unmasked because we want to increment the given
+ * offset only once.
+ */
+ brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_NULL, 0));
+ brw_set_src0(p, send, brw_vec1_reg(payload.file,
+ payload.nr, 0));
+
+ uint32_t sfid, msg_type;
+ if (intel->is_haswell) {
+ sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
+ msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
+ } else {
+ sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+ msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
+ }
+
+ bool header_present = false;
+ bool eot = false;
+ uint32_t mlen = 2; /* offset, value */
+ uint32_t rlen = 0;
+ brw_set_message_descriptor(p, send, sfid, mlen, rlen, header_present, eot);
+
+ send->bits3.ud |= msg_type << 14;
+ send->bits3.ud |= 0 << 13; /* no return data */
+ send->bits3.ud |= 1 << 12; /* SIMD8 mode */
+ send->bits3.ud |= BRW_AOP_ADD << 8;
+ send->bits3.ud |= surf_index << 0;
+}