break;
}
+ /* use PCB unless constant buffer 0 is enabled and is not a user buffer */
+ if ((ilo->cbuf[info->type].enabled_mask & 0x1) &&
+ !ilo->cbuf[info->type].cso[0].user_buffer)
+ variant->use_pcb = false;
+ else
+ variant->use_pcb = true;
+
num_views = ilo->view[info->type].count;
assert(info->num_samplers <= num_views);
break;
}
+ variant->use_pcb = true;
+
variant->num_sampler_views = info->num_samplers;
for (i = 0; i < info->num_samplers; i++) {
if (info->shadow_samplers & (1 << i)) {
/* states used in ilo_shader_variant_init() */
shader->info.non_orthogonal_states = ILO_DIRTY_VIEW_VS |
- ILO_DIRTY_RASTERIZER;
+ ILO_DIRTY_RASTERIZER |
+ ILO_DIRTY_CBUF;
return shader;
}
/* states used in ilo_shader_variant_init() */
shader->info.non_orthogonal_states = ILO_DIRTY_VIEW_GS |
ILO_DIRTY_VS |
- ILO_DIRTY_RASTERIZER;
+ ILO_DIRTY_RASTERIZER |
+ ILO_DIRTY_CBUF;
return shader;
}
/* states used in ilo_shader_variant_init() */
shader->info.non_orthogonal_states = ILO_DIRTY_VIEW_FS |
ILO_DIRTY_RASTERIZER |
- ILO_DIRTY_FB;
+ ILO_DIRTY_FB |
+ ILO_DIRTY_CBUF;
return shader;
}
val = kernel->in.start_grf;
break;
case ILO_KERNEL_SKIP_CBUF0_UPLOAD:
- val = false;
+ val = kernel->skip_cbuf0_upload;
break;
case ILO_KERNEL_PCB_CBUF0_SIZE:
- val = 0;
+ val = kernel->pcb.cbuf0_size;
break;
case ILO_KERNEL_VS_INPUT_INSTANCEID:
}
}
+static bool
+fs_lower_opcode_tgsi_const_pcb(struct fs_compile_context *fcc,
+ struct toy_dst dst, int dim,
+ struct toy_src idx)
+{ /*
+ * Try to lower a constant read into plain MOVs from the GRFs that start
+ * at fcc->first_const_grf ("pcb" presumably stands for push constant
+ * buffer).  Returns true when the MOVs were emitted, and false when the
+ * caller has to load the constant some other way.
+ *
+ * grf and grf_subreg are computed unconditionally, but they are only
+ * used once the guard below has validated idx.
+ */
+ const int grf = fcc->first_const_grf + idx.val32 / 2; /* two constants per GRF */
+ const int grf_subreg = (idx.val32 & 1) * 16; /* odd indices: second half (presumably a byte offset) */
+ struct toy_src src;
+ struct toy_dst real_dst[4];
+ int i;
+ /*
+ * Bail out unless the variant uses the PCB, dim is 0, the index is an
+ * immediate, and the register computed above lies below
+ * fcc->first_attr_grf.
+ */
+ if (!fcc->variant->use_pcb || dim != 0 || idx.file != TOY_FILE_IMM ||
+ grf >= fcc->first_attr_grf)
+ return false;
+ /* source operand inside the selected GRF, using the TOY_RECT_010 region */
+ src = tsrc_rect(tsrc(TOY_FILE_GRF, grf, grf_subreg), TOY_RECT_010);
+ /*
+ * tdst_transpose() fills real_dst[] with four destinations; each one is
+ * written from src advanced by i (presumably one component per MOV --
+ * confirm against tsrc_offset()).
+ */
+ tdst_transpose(dst, real_dst);
+ for (i = 0; i < 4; i++) {
+ /* cast to type D to make sure these are raw moves */
+ tc_MOV(&fcc->tc, tdst_d(real_dst[i]), tsrc_d(tsrc_offset(src, 0, i)));
+ }
+
+ return true;
+}
+
static void
fs_lower_opcode_tgsi_const_gen6(struct fs_compile_context *fcc,
struct toy_dst dst, int dim, struct toy_src idx)
struct toy_dst tmp, real_dst[4];
int i;
+ if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx))
+ return;
+
/* set message header */
inst = tc_MOV(tc, header, r0);
inst->mask_ctrl = BRW_MASK_DISABLE;
struct toy_dst tmp, real_dst[4];
int i;
+ if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx))
+ return;
+
/*
* In 4c1fdae0a01b3f92ec03b61aac1d3df500d51fc6, pull constant load was
* changed from OWord Block Read to ld to increase performance in the
fs_setup_shader_in(fcc->shader, &fcc->tgsi, fcc->variant->u.fs.flatshade);
fs_setup_shader_out(fcc->shader, &fcc->tgsi);
- /* we do not make use of push constant buffers yet */
- num_consts = 0;
+ if (fcc->variant->use_pcb && !fcc->tgsi.const_indirect) {
+ num_consts = (fcc->tgsi.const_count + 1) / 2;
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 287:
+ *
+ * "The sum of all four read length fields (each incremented to
+ * represent the actual read length) must be less than or equal to
+ * 64"
+ *
+ * Since we are usually under high register pressure, do not push more
+ * than 8 registers; a larger constant buffer is not pushed at all.
+ */
+ if (num_consts > 8)
+ num_consts = 0;
+ }
+ else {
+ num_consts = 0;
+ }
+
+ fcc->shader->skip_cbuf0_upload = (!fcc->tgsi.const_count || num_consts);
+ fcc->shader->pcb.cbuf0_size = num_consts * (sizeof(float) * 8);
fcc->first_const_grf = fs_setup_payloads(fcc);
fcc->first_attr_grf = fcc->first_const_grf + num_consts;
} fs;
} u;
+ bool use_pcb;
+
int num_sampler_views;
struct {
unsigned r:3;
bool has_pos;
} out;
+ bool skip_cbuf0_upload;
+
bool has_kill;
bool dispatch_16;
/* what does the push constant buffer consist of? */
struct {
+ int cbuf0_size;
int clip_state_size;
} pcb;
int num_grf_per_vrf;
int first_const_grf;
+ int first_ucp_grf;
int first_vue_grf;
int first_free_grf;
int last_free_grf;
}
}
+static bool
+vs_lower_opcode_tgsi_const_pcb(struct vs_compile_context *vcc,
+ struct toy_dst dst, int dim,
+ struct toy_src idx)
+{ /*
+ * Try to lower a constant read into a single MOV from the GRFs that
+ * start at vcc->first_const_grf ("pcb" presumably stands for push
+ * constant buffer).  Returns true when the MOV was emitted, and false
+ * when the caller has to load the constant some other way.
+ *
+ * i, grf and grf_subreg are computed unconditionally, but they are only
+ * used once the guard below has validated idx.
+ */
+ const int i = idx.val32;
+ const int grf = vcc->first_const_grf + i / 2; /* two constants per GRF */
+ const int grf_subreg = (i & 1) * 16; /* odd indices: second half (presumably a byte offset) */
+ struct toy_src src;
+ /*
+ * Bail out unless the variant uses the PCB, dim is 0, the index is an
+ * immediate, and the register computed above lies below
+ * vcc->first_ucp_grf.
+ */
+ if (!vcc->variant->use_pcb || dim != 0 || idx.file != TOY_FILE_IMM ||
+ grf >= vcc->first_ucp_grf)
+ return false;
+
+ /* source operand inside the selected GRF, using the TOY_RECT_041 region */
+ src = tsrc_rect(tsrc(TOY_FILE_GRF, grf, grf_subreg), TOY_RECT_041);
+ tc_MOV(&vcc->tc, dst, src);
+
+ return true;
+}
+
static void
vs_lower_opcode_tgsi_const_gen6(struct vs_compile_context *vcc,
struct toy_dst dst, int dim,
struct toy_inst *inst;
struct toy_src desc;
+ if (vs_lower_opcode_tgsi_const_pcb(vcc, dst, dim, idx))
+ return;
+
/* set message header */
inst = tc_MOV(tc, header, r0);
inst->mask_ctrl = BRW_MASK_DISABLE;
tdst_ud(tdst(TOY_FILE_MRF, vcc->first_free_mrf, 0));
struct toy_src desc;
+ if (vs_lower_opcode_tgsi_const_pcb(vcc, dst, dim, idx))
+ return;
+
/*
* In 259b65e2e7938de4aab323033cfe2b33369ddb07, pull constant load was
* changed from OWord Dual Block Read to ld to increase performance in the
}
for (j = first_ucp; j <= last_ucp; j++) {
- const int plane_grf = vcc->first_const_grf + j / 2;
+ const int plane_grf = vcc->first_ucp_grf + j / 2;
const int plane_subreg = (j & 1) * 16;
const struct toy_src plane = tsrc_rect(tsrc(TOY_FILE_GRF,
plane_grf, plane_subreg), TOY_RECT_041);
vs_setup_shader_out(vcc->shader, &vcc->tgsi,
(vcc->variant->u.vs.num_ucps > 0), vcc->output_map);
- /* fit each pair of user clip planes into a register */
- num_consts = (vcc->variant->u.vs.num_ucps + 1) / 2;
+ if (vcc->variant->use_pcb && !vcc->tgsi.const_indirect) {
+ num_consts = (vcc->tgsi.const_count + 1) / 2;
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 138:
+ *
+ * "The sum of all four read length fields (each incremented to
+ * represent the actual read length) must be less than or equal to
+ * 32"
+ */
+ if (num_consts > 32)
+ num_consts = 0;
+ }
+ else {
+ num_consts = 0;
+ }
+
+ vcc->shader->skip_cbuf0_upload = (!vcc->tgsi.const_count || num_consts);
+ vcc->shader->pcb.cbuf0_size = num_consts * (sizeof(float) * 8);
/* r0 is reserved for payload header */
vcc->first_const_grf = 1;
- vcc->first_vue_grf = vcc->first_const_grf + num_consts;
+ vcc->first_ucp_grf = vcc->first_const_grf + num_consts;
+
+ /* fit each pair of user clip planes into a register */
+ vcc->first_vue_grf = vcc->first_ucp_grf +
+ (vcc->variant->u.vs.num_ucps + 1) / 2;
+
vcc->first_free_grf = vcc->first_vue_grf + vcc->shader->in.count;
vcc->last_free_grf = 127;
break;
}
+ for (i = 0; i < tgsi_inst->Instruction.NumSrcRegs; i++) {
+ const struct tgsi_full_src_register *s = &tgsi_inst->Src[i];
+ if (s->Register.File == TGSI_FILE_CONSTANT && s->Register.Indirect)
+ tgsi->const_indirect = true;
+ }
+
/* remember channels written */
for (i = 0; i < tgsi_inst->Instruction.NumDstRegs; i++) {
const struct tgsi_full_dst_register *d = &tgsi_inst->Dst[i];
/* immediates should be declared with TGSI_TOKEN_TYPE_IMMEDIATE */
assert(!"unexpected immediate declaration");
break;
- case TGSI_FILE_NULL:
case TGSI_FILE_CONSTANT:
+ if (tgsi->const_count <= decl->Range.Last)
+ tgsi->const_count = decl->Range.Last + 1;
+ break;
+ case TGSI_FILE_NULL:
case TGSI_FILE_TEMPORARY:
case TGSI_FILE_SAMPLER:
case TGSI_FILE_PREDICATE:
} system_values[8];
int num_system_values;
+ int const_count;
+ bool const_indirect;
+
bool uses_kill;
};