* instruction -- the FS opcodes often generate MOVs in addition.
*/
int
-fs_visitor::implied_mrf_writes(fs_inst *inst)
+fs_visitor::implied_mrf_writes(fs_inst *inst) const
{
if (inst->mlen == 0)
return 0;
assert(devinfo->gen >= 6);
const fs_builder abld = bld.annotate("compute sample id");
- fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+ fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uint_type));
if (!key->multisample_fbo) {
/* As per GL_ARB_sample_shading specification:
* TODO: These payload bits exist on Gen7 too, but they appear to always
* be zero, so this code fails to work. We should find out why.
*/
- fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+ fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UW);
abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0),
- BRW_REGISTER_TYPE_B), 1, 8, 0)),
+ BRW_REGISTER_TYPE_UB), 1, 8, 0)),
brw_imm_v(0x44440000));
abld.AND(*reg, tmp, brw_imm_w(0xf));
} else {
const fs_reg t1 = component(fs_reg(VGRF, alloc.allocate(1),
- BRW_REGISTER_TYPE_D), 0);
- const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+ BRW_REGISTER_TYPE_UD), 0);
+ const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UW);
/* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
* 8x multisampling, subspan 0 will represent sample N (where N
* accomodate 16x MSAA.
*/
abld.exec_all().group(1, 0)
- .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
+ .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
brw_imm_ud(0xc0));
abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
if (subgroup_id_index >= 0)
max_push_components--; /* Save a slot for the thread ID */
+ /* FIXME: We currently have some GPU hangs that happen apparently when using
+ * push constants. Since we have no solution for such hangs yet, just
+ * go ahead and use pull constants for now.
+ */
+ if (devinfo->gen == 10 && compiler->supports_pull_constants) {
+ compiler->shader_perf_log(log_data, "Disabling push constants.");
+ max_push_components = 0;
+ }
+
/* We push small arrays, but no bigger than 16 floats. This is big enough
* for a vec4 but hopefully not large enough to push out other stuff. We
* should probably use a better heuristic at some point.
unsigned push_start_align = cplx_align_apply(align, num_push_constants);
unsigned chunk_size = u - chunk_start + 1;
- if (!compiler->supports_pull_constants ||
+ if ((!compiler->supports_pull_constants && u < UBO_START) ||
(chunk_size < max_chunk_size &&
push_start_align + chunk_size <= max_push_components)) {
/* Align up the number of push constants */
regions_overlap(inst->dst, inst->size_written,
inst->src[1], inst->size_read(1))) {
needs_mov = true;
- low.nr = alloc.allocate(regs_written(inst));
- low.offset = low.offset % REG_SIZE;
+ /* Get a new VGRF but keep the same stride as inst->dst */
+ low = fs_reg(VGRF, alloc.allocate(regs_written(inst)),
+ inst->dst.type);
+ low.stride = inst->dst.stride;
+ low.offset = inst->dst.offset % REG_SIZE;
}
- fs_reg high = inst->dst;
- high.nr = alloc.allocate(regs_written(inst));
- high.offset = high.offset % REG_SIZE;
+ /* Get a new VGRF but keep the same stride as inst->dst */
+ fs_reg high(VGRF, alloc.allocate(regs_written(inst)),
+ inst->dst.type);
+ high.stride = inst->dst.stride;
+ high.offset = inst->dst.offset % REG_SIZE;
if (devinfo->gen >= 7) {
if (inst->src[1].file == IMM) {
return MIN2(8, inst->exec_size);
case FS_OPCODE_LINTERP:
- case FS_OPCODE_GET_BUFFER_SIZE:
+ case SHADER_OPCODE_GET_BUFFER_SIZE:
case FS_OPCODE_DDX_COARSE:
case FS_OPCODE_DDX_FINE:
case FS_OPCODE_DDY_COARSE: