i965: Fix execution size of scalar TCS barrier setup code.

[mesa.git] / src / mesa / drivers / dri / i965 / brw_cs.c
diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c

index 9c92faa6db825dcc0dde56dc813e22ba21964fb5..6685acde9d61a01f25719f4a522a320e9aa69e48 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_cs.c
+++ b/src/mesa/drivers/dri/i965/brw_cs.c
@@ -32,42 +32,7 @@
  #include "intel_batchbuffer.h"
  #include "brw_nir.h"
  #include "brw_program.h"
-
-void
-brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
-                             void *buffer, uint32_t threads, uint32_t stride)
-{
-   if (prog_data->local_invocation_id_regs == 0)
-      return;
-
-   /* 'stride' should be an integer number of registers, that is, a multiple
-    * of 32 bytes.
-    */
-   assert(stride % 32 == 0);
-
-   unsigned x = 0, y = 0, z = 0;
-   for (unsigned t = 0; t < threads; t++) {
-      uint32_t *param = (uint32_t *) buffer + stride * t / 4;
-
-      for (unsigned i = 0; i < prog_data->simd_size; i++) {
-         param[0 * prog_data->simd_size + i] = x;
-         param[1 * prog_data->simd_size + i] = y;
-         param[2 * prog_data->simd_size + i] = z;
-
-         x++;
-         if (x == prog_data->local_size[0]) {
-            x = 0;
-            y++;
-            if (y == prog_data->local_size[1]) {
-               y = 0;
-               z++;
-               if (z == prog_data->local_size[2])
-                  z = 0;
-            }
-         }
-      }
-   }
-}
+#include "compiler/glsl/ir_uniform.h"
  
  static void
  assign_cs_binding_table_offsets(const struct brw_device_info *devinfo,
@@ -106,6 +71,19 @@ brw_codegen_cs_prog(struct brw_context *brw,
  
     memset(&prog_data, 0, sizeof(prog_data));
  
+   if (prog->Comp.SharedSize > 64 * 1024) {
+      prog->LinkStatus = false;
+      const char *error_str =
+         "Compute shader used more than 64KB of shared variables";
+      ralloc_strcat(&prog->InfoLog, error_str);
+      _mesa_problem(NULL, "Failed to link compute shader: %s\n", error_str);
+
+      ralloc_free(mem_ctx);
+      return false;
+   } else {
+      prog_data.base.total_shared = prog->Comp.SharedSize;
+   }
+
     assign_cs_binding_table_offsets(brw->intelScreen->devinfo, prog,
                                     &cp->program.Base, &prog_data);
  
@@ -113,7 +91,10 @@ brw_codegen_cs_prog(struct brw_context *brw,
      * prog_data associated with the compiled program, and which will be freed
      * by the state cache.
      */
-   int param_count = cp->program.Base.nir->num_uniforms;
+   int param_count = cp->program.Base.nir->num_uniforms / 4;
+
+   /* The backend also sometimes adds a param for the thread local id. */
+   prog_data.thread_local_id_index = param_count++;
  
     /* The backend also sometimes adds params for texture size. */
     param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits;
@@ -167,10 +148,29 @@ brw_codegen_cs_prog(struct brw_context *brw,
        }
     }
  
-   if (prog_data.base.total_scratch) {
-      brw_get_scratch_bo(brw, &brw->cs.base.scratch_bo,
-                         prog_data.base.total_scratch * brw->max_cs_threads);
-   }
+   const unsigned subslices = MAX2(brw->intelScreen->subslice_total, 1);
+
+   /* WaCSScratchSize:hsw
+    *
+    * Haswell's scratch space address calculation appears to be sparse
+    * rather than tightly packed.  The Thread ID has bits indicating
+    * which subslice, EU within a subslice, and thread within an EU
+    * it is.  There's a maximum of two slices and two subslices, so these
+    * can be stored with a single bit.  Even though there are only 10 EUs
+    * per subslice, this is stored in 4 bits, so there's an effective
+    * maximum value of 16 EUs.  Similarly, although there are only 7
+    * threads per EU, this is stored in a 3 bit number, giving an effective
+    * maximum value of 8 threads per EU.
+    *
+    * This means that we need to use 16 * 8 instead of 10 * 7 for the
+    * number of threads per subslice.
+    */
+   const unsigned scratch_ids_per_subslice =
+      brw->is_haswell ? 16 * 8 : brw->max_cs_threads;
+
+   brw_alloc_stage_scratch(brw, &brw->cs.base,
+                           prog_data.base.total_scratch,
+                           scratch_ids_per_subslice * subslices);
  
     if (unlikely(INTEL_DEBUG & DEBUG_CS))
        fprintf(stderr, "\n");
@@ -198,8 +198,7 @@ brw_cs_populate_key(struct brw_context *brw, struct brw_cs_prog_key *key)
     memset(key, 0, sizeof(*key));
  
     /* _NEW_TEXTURE */
-   brw_populate_sampler_prog_key_data(ctx, prog, brw->cs.base.sampler_count,
-                                      &key->tex);
+   brw_populate_sampler_prog_key_data(ctx, prog, &key->tex);
  
     /* The unique compute program ID */
     key->program_string_id = cp->id;
@@ -221,7 +220,7 @@ brw_upload_cs_prog(struct brw_context *brw)
        return;
  
     brw->cs.base.sampler_count =
-      _mesa_fls(ctx->ComputeProgram._Current->Base.SamplersUsed);
+      util_last_bit(ctx->ComputeProgram._Current->Base.SamplersUsed);
  
     brw_cs_populate_key(brw, &key);