bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
- if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
+ if (this->discard_halt_patches.is_empty())
return false;
int scale = brw_jump_scale(p->devinfo);
- /* There is a somewhat strange undocumented requirement of using
- * HALT, according to the simulator. If some channel has HALTed to
- * a particular UIP, then by the end of the program, every channel
- * must have HALTed to that UIP. Furthermore, the tracking is a
- * stack, so you can't do the final halt of a UIP after starting
- * halting to a new UIP.
- *
- * Symptoms of not emitting this instruction on actual hardware
- * included GPU hangs and sparkly rendering on the piglit discard
- * tests.
- */
- brw_inst *last_halt = gen6_HALT(p);
- brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
- brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+ if (devinfo->gen >= 6) {
+ /* There is a somewhat strange undocumented requirement of using
+ * HALT, according to the simulator. If some channel has HALTed to
+ * a particular UIP, then by the end of the program, every channel
+ * must have HALTed to that UIP. Furthermore, the tracking is a
+ * stack, so you can't do the final halt of a UIP after starting
+ * halting to a new UIP.
+ *
+ * Symptoms of not emitting this instruction on actual hardware
+ * included GPU hangs and sparkly rendering on the piglit discard
+ * tests.
+ */
+ brw_inst *last_halt = brw_HALT(p);
+ brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
+ brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+ }
int ip = p->nr_insn;
brw_inst *patch = &p->store[patch_ip->ip];
assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
- /* HALT takes a half-instruction distance from the pre-incremented IP. */
- brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+ if (devinfo->gen >= 6) {
+ /* HALT takes a half-instruction distance from the pre-incremented IP. */
+ brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+ } else {
+ brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale));
+ }
}
this->discard_halt_patches.make_empty();
+
+ if (devinfo->gen < 6) {
+ /* From the g965 PRM:
+ *
+ * "As DMask is not automatically reloaded into AMask upon completion
+ * of this instruction, software has to manually restore AMask upon
+ * completion."
+ *
+ * DMask lives in the bottom 16 bits of sr0.1.
+ */
+ brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK),
+ retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW));
+ brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1);
+ brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE);
+ brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE);
+ brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH);
+ }
+
+ if (devinfo->gen == 4 && !devinfo->is_g4x) {
+ /* From the g965 PRM:
+ *
+ * "[DevBW, DevCL] Erratum: The subfields in mask stack register are
+ * reset to zero during graphics reset, however, they are not
+ * initialized at thread dispatch. These subfields will retain the
+ * values from the previous thread. Software should make sure the
+ * mask stack is empty (reset to zero) before terminating the thread.
+ * In case that this is not practical, software may have to reset the
+ * mask stack at the beginning of each kernel, which will impact the
+ * performance."
+ *
+ * Luckily we can rely on:
+ *
+ * "[DevBW, DevCL] This register access restriction is not
+ * applicable, hardware does ensure execution pipeline coherency,
+ * when a mask stack register is used as an explicit source and/or
+ * destination."
+ */
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+
+ brw_set_default_exec_size(p, BRW_EXECUTE_2);
+ brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0));
+
+ brw_set_default_exec_size(p, BRW_EXECUTE_16);
+ /* Reset the if stack. */
+ brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW),
+ brw_imm_uw(0));
+
+ brw_pop_insn_state(p);
+ }
+
return true;
}
reg.nr = imm_byte_offset / REG_SIZE;
reg.subnr = imm_byte_offset % REG_SIZE;
- brw_MOV(p, dst, reg);
+ if (type_sz(reg.type) > 4 && !devinfo->has_64bit_float) {
+ brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
+ subscript(reg, BRW_REGISTER_TYPE_D, 0));
+ brw_set_default_swsb(p, tgl_swsb_null());
+ brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
+ subscript(reg, BRW_REGISTER_TYPE_D, 1));
+ } else {
+ brw_MOV(p, dst, reg);
+ }
} else {
/* Prior to Broadwell, there are only 8 address registers. */
assert(inst->exec_size <= 8 || devinfo->gen >= 8);
void
fs_generator::generate_discard_jump(fs_inst *)
{
   /* Emit a HALT and remember its instruction index.  At FB-write time the
    * recorded patch is revisited and the HALT's UIP is pointed at the end of
    * the program; brw_uip_jip() later fills in JIP to the end of the current
    * block (or of the program).
    */
   const int halt_ip = p->nr_insn;
   discard_halt_patches.push_tail(new(mem_ctx) ip_record(halt_ip));
   brw_HALT(p);
}
void
struct brw_compile_stats *stats)
{
/* align to 64 byte boundary. */
- while (p->next_insn_offset % 64)
- brw_NOP(p);
+ brw_realign(p, 64);
this->dispatch_width = dispatch_width;
/* overriding the shader makes disasm_info invalid */
if (!brw_try_override_assembly(p, start_offset, sha1buf)) {
- dump_assembly(p->store, disasm_info, perf.block_latency);
+ dump_assembly(p->store, start_offset, p->next_insn_offset,
+ disasm_info, perf.block_latency);
} else {
fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
}
return start_offset;
}
+void
+fs_generator::add_const_data(void *data, unsigned size)
+{
+ assert(prog_data->const_data_size == 0);
+ if (size > 0) {
+ prog_data->const_data_size = size;
+ prog_data->const_data_offset = brw_append_data(p, data, size, 32);
+ }
+}
+
const unsigned *
fs_generator::get_assembly()
{