i965/cs: Split out helper for building local id payload
author Kristian Høgsberg Kristensen <krh@bitplanet.net>
Tue, 6 Oct 2015 05:07:58 +0000 (22:07 -0700)
committer Kristian Høgsberg Kristensen <krh@bitplanet.net>
Thu, 8 Oct 2015 19:15:02 +0000 (12:15 -0700)
The initial motivation for this patch was to avoid calling
brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the
compiler. This commit ends up refactoring things a bit more so as to
split out the logic to build the local id payload to brw_fs.cpp. This
moves the payload building closer to the compiler code that uses the
payload layout and makes it available to other users of the compiler.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Signed-off-by: Kristian Høgsberg Kristensen <krh@bitplanet.net>
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_cs.h
src/mesa/drivers/dri/i965/brw_fs.cpp
src/mesa/drivers/dri/i965/gen7_cs_state.c

index 0a29a6920162fadb7499a7324b1863335291334a..1869f2843316c632dc9463bc6849ddcaf6f94003 100644 (file)
@@ -484,6 +484,7 @@ struct brw_cs_prog_data {
    unsigned simd_size;
    bool uses_barrier;
    bool uses_num_work_groups;
+   unsigned local_invocation_id_regs;
 
    struct {
       /** @{
index 0c0ed2bc909897a264bd8b643cfcdc3760bd8af6..c07eb6ca6ee603b131dad3c3a6f2e30fe0431ad4 100644 (file)
@@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw,
             struct gl_shader_program *prog,
             unsigned *final_assembly_size);
 
-unsigned
-brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width);
+void
+brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
+                             void *buffer, uint32_t threads, uint32_t stride);
 
 #ifdef __cplusplus
 }
index 7c401535f886ab5a8ee70cc47690c64a8620b249..6ce157033404ed5390b4157a255208e8f8c23da0 100644 (file)
@@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload()
    payload.num_regs = 2;
 }
 
+/**
+ * We are building the local ID push constant data using the simplest possible
+ * method. We simply push the local IDs directly as they should appear in the
+ * registers for the uvec3 gl_LocalInvocationID variable.
+ *
+ * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
+ * registers worth of push constant space.
+ *
+ * Note: Any updates to fs_visitor::setup_cs_payload,
+ * brw_cs_fill_local_id_payload or
+ * fs_visitor::emit_cs_local_invocation_id_setup need to be coordinated.
+ *
+ * FINISHME: There are a few easy optimizations to consider.
+ *
+ * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
+ *    no need for using push constant space for that dimension.
+ *
+ * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
+ *    easily use 16-bit words rather than 32-bit dwords in the push constant
+ *    data.
+ *
+ * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
+ *    conveying the data, and thereby reduce push constant usage.
+ *
+ */
 void
 fs_visitor::setup_cs_payload()
 {
    assert(devinfo->gen >= 7);
+   brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
 
    payload.num_regs = 1;
 
    if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
-      const unsigned local_id_dwords =
-         brw_cs_prog_local_id_payload_dwords(dispatch_width);
-      assert((local_id_dwords & 0x7) == 0);
-      const unsigned local_id_regs = local_id_dwords / 8;
+      prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
       payload.local_invocation_id_reg = payload.num_regs;
-      payload.num_regs += local_id_regs;
+      payload.num_regs += prog_data->local_invocation_id_regs;
    }
 }
 
@@ -5171,6 +5194,42 @@ brw_wm_fs_emit(struct brw_context *brw,
    return g.get_assembly(final_assembly_size);
 }
 
+void
+brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
+                             void *buffer, uint32_t threads, uint32_t stride)
+{
+   if (prog_data->local_invocation_id_regs == 0) /* no local-ID regs: nothing to write */
+      return;
+
+   /* 'stride' should be an integer number of registers, that is, a multiple
+    * of 32 bytes.
+    */
+   assert(stride % 32 == 0);
+
+   unsigned x = 0, y = 0, z = 0; /* running ID; carried over from thread to thread */
+   for (unsigned t = 0; t < threads; t++) {
+      uint32_t *param = (uint32_t *) buffer + stride * t / 4; /* byte stride -> dword offset */
+
+      for (unsigned i = 0; i < prog_data->simd_size; i++) {
+         param[0 * prog_data->simd_size + i] = x; /* first simd_size dwords: x per channel */
+         param[1 * prog_data->simd_size + i] = y; /* next simd_size dwords: y per channel */
+         param[2 * prog_data->simd_size + i] = z; /* last simd_size dwords: z per channel */
+
+         x++; /* x advances fastest, carrying into y and then z */
+         if (x == prog_data->local_size[0]) {
+            x = 0;
+            y++;
+            if (y == prog_data->local_size[1]) {
+               y = 0;
+               z++;
+               if (z == prog_data->local_size[2])
+                  z = 0; /* past the last ID: later channels restart at (0, 0, 0) */
+            }
+         }
+      }
+   }
+}
+
 fs_reg *
 fs_visitor::emit_cs_local_invocation_id_setup()
 {
index 5edc4fc98423b08fae80cad6f34f9c7d021a81f1..6aeb0cb243fec4f89b55aa8c30cdcb052320896c 100644 (file)
@@ -70,10 +70,8 @@ brw_upload_cs_state(struct brw_context *brw)
 
    unsigned local_id_dwords = 0;
 
-   if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
-      local_id_dwords =
-         brw_cs_prog_local_id_payload_dwords(cs_prog_data->simd_size);
-   }
+   if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID)
+      local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
 
    unsigned push_constant_data_size =
       (prog_data->nr_params + local_id_dwords) * sizeof(gl_constant_value);
@@ -190,63 +188,6 @@ const struct brw_tracked_state brw_cs_state = {
 };
 
 
-/**
- * We are building the local ID push constant data using the simplest possible
- * method. We simply push the local IDs directly as they should appear in the
- * registers for the uvec3 gl_LocalInvocationID variable.
- *
- * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
- * registers worth of push constant space.
- *
- * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
- * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
- * to coordinated.
- *
- * FINISHME: There are a few easy optimizations to consider.
- *
- * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
- *    no need for using push constant space for that dimension.
- *
- * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
- *    easily use 16-bit words rather than 32-bit dwords in the push constant
- *    data.
- *
- * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
- *    conveying the data, and thereby reduce push constant usage.
- *
- */
-unsigned
-brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width)
-{
-   return 3 * dispatch_width;
-}
-
-
-static void
-fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
-                      void *buffer, unsigned *x, unsigned *y, unsigned *z)
-{
-   uint32_t *param = (uint32_t *)buffer;
-   for (unsigned i = 0; i < cs_prog_data->simd_size; i++) {
-      param[0 * cs_prog_data->simd_size + i] = *x;
-      param[1 * cs_prog_data->simd_size + i] = *y;
-      param[2 * cs_prog_data->simd_size + i] = *z;
-
-      (*x)++;
-      if (*x == cs_prog_data->local_size[0]) {
-         *x = 0;
-         (*y)++;
-         if (*y == cs_prog_data->local_size[1]) {
-            *y = 0;
-            (*z)++;
-            if (*z == cs_prog_data->local_size[2])
-               *z = 0;
-         }
-      }
-   }
-}
-
-
 /**
  * Creates a region containing the push constants for the CS on gen7+.
  *
@@ -269,10 +210,8 @@ brw_upload_cs_push_constants(struct brw_context *brw,
       (struct brw_stage_prog_data*) cs_prog_data;
    unsigned local_id_dwords = 0;
 
-   if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
-      local_id_dwords =
-         brw_cs_prog_local_id_payload_dwords(cs_prog_data->simd_size);
-   }
+   if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID)
+      local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
 
    /* Updates the ParamaterValues[i] pointers for all parameters of the
     * basic type of PROGRAM_STATE_VAR.
@@ -302,14 +241,13 @@ brw_upload_cs_push_constants(struct brw_context *brw,
 
       STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));
 
+      brw_cs_fill_local_id_payload(cs_prog_data, param, threads,
+                                   reg_aligned_constant_size);
+
       /* _NEW_PROGRAM_CONSTANTS */
-      unsigned x = 0, y = 0, z = 0;
       for (t = 0; t < threads; t++) {
-         gl_constant_value *next_param = &param[t * param_aligned_count];
-         if (local_id_dwords > 0) {
-            fill_local_id_payload(cs_prog_data, (void*)next_param, &x, &y, &z);
-            next_param += local_id_dwords;
-         }
+         gl_constant_value *next_param =
+            &param[t * param_aligned_count + local_id_dwords];
          for (i = 0; i < prog_data->nr_params; i++) {
             next_param[i] = *prog_data->param[i];
          }