prog_data->local_size[0] = cp->LocalSize[0];
prog_data->local_size[1] = cp->LocalSize[1];
prog_data->local_size[2] = cp->LocalSize[2];
- int local_workgroup_size =
+ unsigned local_workgroup_size =
cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2];
cfg_t *cfg = NULL;
const char *fail_msg = NULL;
+ int st_index = -1;
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ st_index = brw_get_shader_time_index(brw, prog, &cp->Base, ST_CS);
+
/* Now the main event: Visit the shader IR and generate our CS IR for it.
*/
- fs_visitor v8(brw, mem_ctx, key, prog_data, prog, cp, 8);
+ fs_visitor v8(brw->intelScreen->compiler, brw,
+ mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog,
+ &cp->Base, 8, st_index);
if (!v8.run_cs()) {
fail_msg = v8.fail_msg;
} else if (local_workgroup_size <= 8 * brw->max_cs_threads) {
prog_data->simd_size = 8;
}
- fs_visitor v16(brw, mem_ctx, key, prog_data, prog, cp, 16);
+ fs_visitor v16(brw->intelScreen->compiler, brw,
+ mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog,
+ &cp->Base, 16, st_index);
if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
!fail_msg && !v8.simd16_unsupported &&
local_workgroup_size <= 16 * brw->max_cs_threads) {
return NULL;
}
- fs_generator g(brw, mem_ctx, (void*) key, &prog_data->base, &cp->Base,
+ fs_generator g(brw->intelScreen->compiler, brw,
+ mem_ctx, (void*) key, &prog_data->base, &cp->Base,
v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
if (INTEL_DEBUG & DEBUG_CS) {
char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d",
static void
brw_cs_populate_key(struct brw_context *brw, struct brw_cs_prog_key *key)
{
- struct gl_context *ctx = &brw->ctx;
/* BRW_NEW_COMPUTE_PROGRAM */
const struct brw_compute_program *cp =
(struct brw_compute_program *) brw->compute_program;
- const struct gl_program *prog = (struct gl_program *) cp;
memset(key, 0, sizeof(*key));
}
+static unsigned
+get_cs_thread_count(const struct brw_cs_prog_data *cs_prog_data)
+{
+ const unsigned simd_size = cs_prog_data->simd_size;
+ unsigned group_size = cs_prog_data->local_size[0] *
+ cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
+
+ return (group_size + simd_size - 1) / simd_size;
+}
+
+
static void
brw_upload_cs_state(struct brw_context *brw)
{
prog_data->binding_table.size_bytes,
32, &stage_state->bind_bo_offset);
+ unsigned threads = get_cs_thread_count(cs_prog_data);
+
uint32_t dwords = brw->gen < 8 ? 8 : 9;
BEGIN_BATCH(dwords);
OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2));
desc[dw++] = 0;
desc[dw++] = 0;
desc[dw++] = stage_state->bind_bo_offset;
+ desc[dw++] = 0;
+ const uint32_t media_threads =
+ brw->gen >= 8 ?
+ SET_FIELD(threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
+ SET_FIELD(threads, MEDIA_GPGPU_THREAD_COUNT);
+ assert(threads <= brw->max_cs_threads);
+ desc[dw++] = media_threads;
BEGIN_BATCH(4);
OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2));
extern "C"
const struct brw_tracked_state brw_cs_state = {
- .dirty = {
- .mesa = 0,
- .brw = BRW_NEW_CS_PROG_DATA,
+ /* explicit initialisers aren't valid C++, comment
+ * them for documentation purposes */
+ /* .dirty = */{
+ /* .mesa = */ 0,
+ /* .brw = */ BRW_NEW_CS_PROG_DATA,
},
- .emit = brw_upload_cs_state
+ /* .emit = */ brw_upload_cs_state
};