i965/nir/vec4: Prepare source and destination registers for ALU operations
[mesa.git] / src / mesa / drivers / dri / i965 / brw_cs.cpp
index e2f3d6310dde789b520a5ecd39151be53d92a0c0..29ee75b1e1a855575363f53a7ce13612e01e16af 100644 (file)
@@ -82,15 +82,21 @@ brw_cs_emit(struct brw_context *brw,
    prog_data->local_size[0] = cp->LocalSize[0];
    prog_data->local_size[1] = cp->LocalSize[1];
    prog_data->local_size[2] = cp->LocalSize[2];
-   int local_workgroup_size =
+   unsigned local_workgroup_size =
       cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2];
 
    cfg_t *cfg = NULL;
    const char *fail_msg = NULL;
 
+   int st_index = -1;
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      st_index = brw_get_shader_time_index(brw, prog, &cp->Base, ST_CS);
+
    /* Now the main event: Visit the shader IR and generate our CS IR for it.
     */
-   fs_visitor v8(brw, mem_ctx, key, prog_data, prog, cp, 8);
+   fs_visitor v8(brw->intelScreen->compiler, brw,
+                 mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog,
+                 &cp->Base, 8, st_index);
    if (!v8.run_cs()) {
       fail_msg = v8.fail_msg;
    } else if (local_workgroup_size <= 8 * brw->max_cs_threads) {
@@ -98,7 +104,9 @@ brw_cs_emit(struct brw_context *brw,
       prog_data->simd_size = 8;
    }
 
-   fs_visitor v16(brw, mem_ctx, key, prog_data, prog, cp, 16);
+   fs_visitor v16(brw->intelScreen->compiler, brw,
+                  mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog,
+                  &cp->Base, 16, st_index);
    if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
        !fail_msg && !v8.simd16_unsupported &&
        local_workgroup_size <= 16 * brw->max_cs_threads) {
@@ -126,7 +134,8 @@ brw_cs_emit(struct brw_context *brw,
       return NULL;
    }
 
-   fs_generator g(brw, mem_ctx, (void*) key, &prog_data->base, &cp->Base,
+   fs_generator g(brw->intelScreen->compiler, brw,
+                  mem_ctx, (void*) key, &prog_data->base, &cp->Base,
                   v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
    if (INTEL_DEBUG & DEBUG_CS) {
       char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d",
@@ -212,11 +221,9 @@ brw_codegen_cs_prog(struct brw_context *brw,
 static void
 brw_cs_populate_key(struct brw_context *brw, struct brw_cs_prog_key *key)
 {
-   struct gl_context *ctx = &brw->ctx;
    /* BRW_NEW_COMPUTE_PROGRAM */
    const struct brw_compute_program *cp =
       (struct brw_compute_program *) brw->compute_program;
-   const struct gl_program *prog = (struct gl_program *) cp;
 
    memset(key, 0, sizeof(*key));
 
@@ -284,6 +291,17 @@ brw_cs_precompile(struct gl_context *ctx,
 }
 
 
+static unsigned
+get_cs_thread_count(const struct brw_cs_prog_data *cs_prog_data)
+{
+   const unsigned simd_size = cs_prog_data->simd_size;
+   unsigned group_size = cs_prog_data->local_size[0] *
+      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
+
+   return (group_size + simd_size - 1) / simd_size;
+}
+
+
 static void
 brw_upload_cs_state(struct brw_context *brw)
 {
@@ -309,6 +327,8 @@ brw_upload_cs_state(struct brw_context *brw)
                                             prog_data->binding_table.size_bytes,
                                             32, &stage_state->bind_bo_offset);
 
+   unsigned threads = get_cs_thread_count(cs_prog_data);
+
    uint32_t dwords = brw->gen < 8 ? 8 : 9;
    BEGIN_BATCH(dwords);
    OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2));
@@ -358,6 +378,13 @@ brw_upload_cs_state(struct brw_context *brw)
    desc[dw++] = 0;
    desc[dw++] = 0;
    desc[dw++] = stage_state->bind_bo_offset;
+   desc[dw++] = 0;
+   const uint32_t media_threads =
+      brw->gen >= 8 ?
+      SET_FIELD(threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
+      SET_FIELD(threads, MEDIA_GPGPU_THREAD_COUNT);
+   assert(threads <= brw->max_cs_threads);
+   desc[dw++] = media_threads;
 
    BEGIN_BATCH(4);
    OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2));
@@ -370,9 +397,11 @@ brw_upload_cs_state(struct brw_context *brw)
 
 extern "C"
 const struct brw_tracked_state brw_cs_state = {
-   .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_CS_PROG_DATA,
+   /* explicit initialisers aren't valid C++, comment
+    * them for documentation purposes */
+   /* .dirty = */{
+      /* .mesa = */ 0,
+      /* .brw = */  BRW_NEW_CS_PROG_DATA,
    },
-   .emit = brw_upload_cs_state
+   /* .emit = */ brw_upload_cs_state
 };