i965/cs: Implement brw_emit_gpgpu_walker
author Jordan Justen <jordan.l.justen@intel.com>
Thu, 28 Aug 2014 21:47:19 +0000 (14:47 -0700)
committer Jordan Justen <jordan.l.justen@intel.com>
Sat, 2 May 2015 07:50:00 +0000 (00:50 -0700)
Tested on Ivybridge, Haswell and Broadwell.

v2:
 * Use SET_FIELD. (Ken)
 * Use simd_size / 16 to support SIMD8/16/32. Ken suggested
   that we might be able to do it arithmetically rather than just
   supporting SIMD8 and SIMD16 with a conditional.

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
src/mesa/drivers/dri/i965/brw_compute.c
src/mesa/drivers/dri/i965/brw_defines.h

index baed7010a3d2a60fe05ecb103b86c74fb29d3304..044deae83c989e03a19cc30f55572051a7502f65 100644 (file)
 #include "brw_draw.h"
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
+#include "brw_defines.h"
 
 
 static void
 brw_emit_gpgpu_walker(struct brw_context *brw, const GLuint *num_groups)
 {
-   _mesa_problem(&brw->ctx, "TODO: implement brw_emit_gpgpu_walker");
+   const struct brw_cs_prog_data *prog_data = brw->cs.prog_data;
+
+   const unsigned simd_size = prog_data->simd_size;
+   unsigned group_size = prog_data->local_size[0] *
+      prog_data->local_size[1] * prog_data->local_size[2];
+   unsigned thread_width_max =
+      (group_size + simd_size - 1) / simd_size;
+
+   uint32_t right_mask = (1u << simd_size) - 1;
+   const unsigned right_non_aligned = group_size & (simd_size - 1);
+   if (right_non_aligned != 0)
+      right_mask >>= (simd_size - right_non_aligned);
+
+   uint32_t dwords = brw->gen < 8 ? 11 : 15;
+   BEGIN_BATCH(dwords);
+   OUT_BATCH(GPGPU_WALKER << 16 | (dwords - 2));
+   OUT_BATCH(0);
+   if (brw->gen >= 8) {
+      OUT_BATCH(0);                     /* Indirect Data Length */
+      OUT_BATCH(0);                     /* Indirect Data Start Address */
+   }
+   assert(thread_width_max <= brw->max_cs_threads);
+   OUT_BATCH(SET_FIELD(simd_size / 16, GPGPU_WALKER_SIMD_SIZE) |
+             SET_FIELD(thread_width_max - 1, GPGPU_WALKER_THREAD_WIDTH_MAX));
+   OUT_BATCH(0);                        /* Thread Group ID Starting X */
+   if (brw->gen >= 8)
+      OUT_BATCH(0);                     /* MBZ */
+   OUT_BATCH(num_groups[0]);            /* Thread Group ID X Dimension */
+   OUT_BATCH(0);                        /* Thread Group ID Starting Y */
+   if (brw->gen >= 8)
+      OUT_BATCH(0);                     /* MBZ */
+   OUT_BATCH(num_groups[1]);            /* Thread Group ID Y Dimension */
+   OUT_BATCH(0);                        /* Thread Group ID Starting/Resume Z */
+   OUT_BATCH(num_groups[2]);            /* Thread Group ID Z Dimension */
+   OUT_BATCH(right_mask);               /* Right Execution Mask */
+   OUT_BATCH(0xffffffff);               /* Bottom Execution Mask */
+   ADVANCE_BATCH();
 }
 
 
index d4b5b2496143b3a66474cdc3ef4e337a180c324d..541798457716b16c10352d8c20828186bd852f69 100644 (file)
@@ -2469,5 +2469,18 @@ enum brw_wm_barycentric_interp_mode {
 # define MEDIA_VFE_STATE_CURBE_ALLOC_MASK       INTEL_MASK(15, 0)
 
 #define MEDIA_INTERFACE_DESCRIPTOR_LOAD         0x7002
+#define GPGPU_WALKER                            0x7105
+/* GEN8+ DW2 */
+# define GPGPU_WALKER_INDIRECT_LENGTH_SHIFT     0
+# define GPGPU_WALKER_INDIRECT_LENGTH_MASK      INTEL_MASK(15, 0)
+/* GEN7 DW2, GEN8+ DW4: SIMD size plus 6-bit thread depth/height/width maxima */
+# define GPGPU_WALKER_SIMD_SIZE_SHIFT           30
+# define GPGPU_WALKER_SIMD_SIZE_MASK            INTEL_MASK(31, 30)
+# define GPGPU_WALKER_THREAD_DEPTH_MAX_SHIFT    16
+# define GPGPU_WALKER_THREAD_DEPTH_MAX_MASK     INTEL_MASK(21, 16)
+# define GPGPU_WALKER_THREAD_HEIGHT_MAX_SHIFT   8
+# define GPGPU_WALKER_THREAD_HEIGHT_MAX_MASK    INTEL_MASK(13, 8)
+# define GPGPU_WALKER_THREAD_WIDTH_MAX_SHIFT    0
+# define GPGPU_WALKER_THREAD_WIDTH_MAX_MASK     INTEL_MASK(5, 0)
 
 #endif