FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
+ FS_OPCODE_DISCARD_JUMP,
VS_OPCODE_URB_WRITE,
VS_OPCODE_SCRATCH_READ,
struct brw_instruction *brw_BREAK(struct brw_compile *p);
struct brw_instruction *brw_CONT(struct brw_compile *p);
struct brw_instruction *gen6_CONT(struct brw_compile *p);
+struct brw_instruction *gen6_HALT(struct brw_compile *p);
/* Forward jumps:
*/
void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx);
return insn;
}
+struct brw_instruction *gen6_HALT(struct brw_compile *p)
+{
+ struct brw_instruction *insn;
+
+ insn = next_insn(p, BRW_OPCODE_HALT);
+ brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+ brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+ brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
+
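+ /* In compressed (SIMD16) dispatch the HALT must cover all 16 channels;
+ * otherwise force an uncompressed SIMD8 HALT.
+ */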
+ if (p->compressed) {
+ insn->header.execution_size = BRW_EXECUTE_16;
+ } else {
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.execution_size = BRW_EXECUTE_8;
+ }
+ return insn;
+}
+
/* DO/WHILE loop:
*
* The DO/WHILE is just an unterminated loop -- break or continue are
return ip;
}
}
- assert(!"not reached");
- return start + 1;
+
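+ /* No enclosing block end was found. BREAK and CONT assert that this
+ * never happens; HALT uses it to detect a halt outside of any
+ * conditional block.
+ */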
+ return 0;
}
/* There is no DO instruction on gen6, so to find the end of the loop
}
/* After program generation, go back and update the UIP and JIP of
- * BREAK and CONT instructions to their correct locations.
+ * BREAK, CONT, and HALT instructions to their correct locations.
*/
void
brw_set_uip_jip(struct brw_compile *p)
continue;
}
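+ /* BREAK, CONT, and HALT all base their JIP on the end of the enclosing
+ * block, so look it up once.
+ */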
+ int block_end_ip = brw_find_next_block_end(p, ip);
switch (insn->header.opcode) {
case BRW_OPCODE_BREAK:
- insn->bits3.break_cont.jip =
- (brw_find_next_block_end(p, ip) - ip) / scale;
+ assert(block_end_ip != 0);
+ insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
/* Gen7 UIP points to WHILE; Gen6 points just after it */
insn->bits3.break_cont.uip =
(brw_find_loop_end(p, ip) - ip +
(intel->gen == 6 ? 16 : 0)) / scale;
break;
case BRW_OPCODE_CONTINUE:
- insn->bits3.break_cont.jip =
- (brw_find_next_block_end(p, ip) - ip) / scale;
+ assert(block_end_ip != 0);
+ insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
insn->bits3.break_cont.uip =
(brw_find_loop_end(p, ip) - ip) / scale;
assert(insn->bits3.break_cont.uip != 0);
assert(insn->bits3.break_cont.jip != 0);
break;
+ case BRW_OPCODE_HALT:
+ /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
+ *
+ * "In case of the halt instruction not inside any conditional
+ * code block, the value of <JIP> and <UIP> should be the
+ * same. In case of the halt instruction inside conditional code
+ * block, the <UIP> should be the end of the program, and the
+ * <JIP> should be end of the most inner conditional code block."
+ *
+ * The UIP will have already been set by whoever set up the
+ * instruction.
+ */
+ if (block_end_ip == 0) {
+ insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
+ } else {
+ insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
+ }
+ assert(insn->bits3.break_cont.uip != 0);
+ assert(insn->bits3.break_cont.jip != 0);
+ break;
}
}
}
static const fs_reg reg_null_f(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_F);
static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D);
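+/* A simple exec_list node holding an instruction offset, used to remember
+ * which HALT instructions need their UIP patched at FB-write time.
+ */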
+class ip_record : public exec_node {
+public:
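+ /* ralloc-based new: nodes are freed with their ralloc context, so
+ * callers never call delete.
+ */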
+ static void* operator new(size_t size, void *ctx)
+ {
+ void *node;
+
+ node = rzalloc_size(ctx, size);
+ assert(node != NULL);
+
+ return node;
+ }
+
+ ip_record(int ip)
+ {
+ this->ip = ip;
+ }
+
+ int ip;
+};
+
class fs_inst : public backend_instruction {
public:
/* Callers of this ralloc-based new need not call delete. It's
struct brw_reg index,
struct brw_reg offset);
void generate_mov_dispatch_to_flags(fs_inst *inst);
+ void generate_discard_jump(fs_inst *inst);
+
+ void patch_discard_jumps_to_fb_writes();
struct brw_context *brw;
struct intel_context *intel;
unsigned dispatch_width; /**< 8 or 16 */
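+ /** List of ip_records for HALTs patched in patch_discard_jumps_to_fb_writes(). */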
+ exec_list discard_halt_patches;
bool dual_source_output;
void *mem_ctx;
};
{
}
+void
+fs_generator::patch_discard_jumps_to_fb_writes()
+{
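+ /* Discard jumps only exist on gen6+, and there is nothing to patch if
+ * no discards were emitted.
+ */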
+ if (intel->gen < 6 || this->discard_halt_patches.is_empty())
+ return;
+
+ /* There is a somewhat strange, undocumented requirement for using
+ * HALT, according to the simulator: if some channel has HALTed to a
+ * particular UIP, then by the end of the program every channel must
+ * have HALTed to that UIP. Furthermore, the tracking is a stack, so
+ * you can't do the final halt of a UIP after starting to halt to a
+ * new UIP.
+ *
+ * Symptoms of not emitting this instruction on actual hardware
+ * included GPU hangs and sparkly rendering on the piglit discard
+ * tests.
+ */
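+ /* A UIP/JIP of 2 is one whole instruction in half-instruction units,
+ * so this final HALT simply falls through.
+ */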
+ struct brw_instruction *last_halt = gen6_HALT(p);
+ last_halt->bits3.break_cont.uip = 2;
+ last_halt->bits3.break_cont.jip = 2;
+
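+ /* p->nr_insn is now the end of the program; point each recorded HALT's
+ * UIP there.
+ */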
+ int ip = p->nr_insn;
+
+ foreach_list(node, &this->discard_halt_patches) {
+ ip_record *patch_ip = (ip_record *)node;
+ struct brw_instruction *patch = &p->store[patch_ip->ip];
+
+ assert(patch->header.opcode == BRW_OPCODE_HALT);
+ /* HALT takes a half-instruction distance from the pre-incremented IP. */
+ patch->bits3.break_cont.uip = (ip - patch_ip->ip) * 2;
+ }
+
+ this->discard_halt_patches.make_empty();
+}
+
void
fs_generator::generate_fb_write(fs_inst *inst)
{
struct brw_reg implied_header;
uint32_t msg_control;
+ /* The discard jumps emitted so far land at this point, so the g0 ->
+ * base_mrf header setup must live inside this function for the jump
+ * target to include it.
+ */
+ patch_discard_jumps_to_fb_writes();
+
/* The header is 2 regs; g0 and g1 are the contents. g0 is covered by
 * the implied header move, so here's g1.
 */
brw_ADD(p, dst, src0, negate(src1));
}
+void
+fs_generator::generate_discard_jump(fs_inst *inst)
+{
+ assert(intel->gen >= 6);
+
+ /* This HALT will be patched up at FB write time to point UIP at the end of
+ * the program, and at brw_set_uip_jip() time JIP will be set to the end of
+ * the current block (or the program).
+ */
+ this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
+
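+ /* Emit the HALT itself under NoMask; the push/pop confines the mask
+ * control change to this one instruction.
+ */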
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ gen6_HALT(p);
+ brw_pop_insn_state(p);
+}
+
void
fs_generator::generate_spill(fs_inst *inst, struct brw_reg src)
{
generate_mov_dispatch_to_flags(inst);
break;
+ case FS_OPCODE_DISCARD_JUMP:
+ generate_discard_jump(inst);
+ break;
+
case SHADER_OPCODE_SHADER_TIME_ADD:
brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
break;
BRW_CONDITIONAL_NZ));
cmp->predicate = BRW_PREDICATE_NORMAL;
cmp->flag_subreg = 1;
+
+ if (intel->gen >= 6) {
+ /* For performance, after a discard, jump to the end of the shader.
+ * However, many people will do foliage by discarding based on a
+ * texture's alpha mask, and then continue on to texture with the
+ * remaining pixels. To avoid trashing the derivatives for those
+ * texture samples, we'll only jump if all of the pixels in the subspan
+ * have been discarded.
+ */
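+ /* ANY4H with predicate_inverse: only take the jump when no channel in
+ * the 2x2 subspan still has its flag bit set, i.e. all four pixels
+ * have been discarded.
+ */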
+ fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
+ discard_jump->flag_subreg = 1;
+ discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
+ discard_jump->predicate_inverse = true;
+ }
}
void