nvc0: use sched control codes for gm107 blitter shader

[mesa.git] / src / gallium / drivers / nouveau / nvc0 / nvc0_compute.c
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c

index 0fe6353b2ab5255f7587892224ef7823b2c387fd..11635c94658b01b08c3621e9a78e9cbe098c5431 100644 (file)
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -23,7 +23,8 @@
   */
  
  #include "nvc0/nvc0_context.h"
-#include "nvc0/nvc0_compute.h"
+
+#include "nvc0/nvc0_compute.xml.h"
  
  int
  nvc0_screen_compute_setup(struct nvc0_screen *screen,
@@ -54,88 +55,135 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen,
        return ret;
     }
  
-   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL,
-                        &screen->parm);
-   if (ret)
-      return ret;
-
-   BEGIN_NVC0(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
+   BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
     PUSH_DATA (push, screen->compute->oclass);
  
     /* hardware limit */
-   BEGIN_NVC0(push, NVC0_COMPUTE(MP_LIMIT), 1);
+   BEGIN_NVC0(push, NVC0_CP(MP_LIMIT), 1);
     PUSH_DATA (push, screen->mp_count);
-   BEGIN_NVC0(push, NVC0_COMPUTE(CALL_LIMIT_LOG), 1);
+   BEGIN_NVC0(push, NVC0_CP(CALL_LIMIT_LOG), 1);
     PUSH_DATA (push, 0xf);
  
-   BEGIN_NVC0(push, SUBC_COMPUTE(0x02a0), 1);
+   BEGIN_NVC0(push, SUBC_CP(0x02a0), 1);
     PUSH_DATA (push, 0x8000);
  
     /* global memory setup */
-   BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1);
+   BEGIN_NVC0(push, SUBC_CP(0x02c4), 1);
     PUSH_DATA (push, 0);
-   BEGIN_NIC0(push, NVC0_COMPUTE(GLOBAL_BASE), 0x100);
+   BEGIN_NIC0(push, NVC0_CP(GLOBAL_BASE), 0x100);
     for (i = 0; i <= 0xff; i++)
        PUSH_DATA (push, (0xc << 28) | (i << 16) | i);
-   BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1);
+   BEGIN_NVC0(push, SUBC_CP(0x02c4), 1);
     PUSH_DATA (push, 1);
  
     /* local memory and cstack setup */
-   BEGIN_NVC0(push, NVC0_COMPUTE(TEMP_ADDRESS_HIGH), 2);
+   BEGIN_NVC0(push, NVC0_CP(TEMP_ADDRESS_HIGH), 2);
     PUSH_DATAh(push, screen->tls->offset);
     PUSH_DATA (push, screen->tls->offset);
-   BEGIN_NVC0(push, NVC0_COMPUTE(TEMP_SIZE_HIGH), 2);
+   BEGIN_NVC0(push, NVC0_CP(TEMP_SIZE_HIGH), 2);
     PUSH_DATAh(push, screen->tls->size);
     PUSH_DATA (push, screen->tls->size);
-   BEGIN_NVC0(push, NVC0_COMPUTE(WARP_TEMP_ALLOC), 1);
+   BEGIN_NVC0(push, NVC0_CP(WARP_TEMP_ALLOC), 1);
     PUSH_DATA (push, 0);
-   BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_BASE), 1);
-   PUSH_DATA (push, 1 << 24);
+   BEGIN_NVC0(push, NVC0_CP(LOCAL_BASE), 1);
+   PUSH_DATA (push, 0xff << 24);
  
     /* shared memory setup */
-   BEGIN_NVC0(push, NVC0_COMPUTE(CACHE_SPLIT), 1);
+   BEGIN_NVC0(push, NVC0_CP(CACHE_SPLIT), 1);
     PUSH_DATA (push, NVC0_COMPUTE_CACHE_SPLIT_48K_SHARED_16K_L1);
-   BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_BASE), 1);
-   PUSH_DATA (push, 2 << 24);
-   BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_SIZE), 1);
+   BEGIN_NVC0(push, NVC0_CP(SHARED_BASE), 1);
+   PUSH_DATA (push, 0xfe << 24);
+   BEGIN_NVC0(push, NVC0_CP(SHARED_SIZE), 1);
     PUSH_DATA (push, 0);
  
     /* code segment setup */
-   BEGIN_NVC0(push, NVC0_COMPUTE(CODE_ADDRESS_HIGH), 2);
+   BEGIN_NVC0(push, NVC0_CP(CODE_ADDRESS_HIGH), 2);
     PUSH_DATAh(push, screen->text->offset);
     PUSH_DATA (push, screen->text->offset);
  
-   /* TODO: textures & samplers */
+   /* textures */
+   BEGIN_NVC0(push, NVC0_CP(TIC_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->txc->offset);
+   PUSH_DATA (push, screen->txc->offset);
+   PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1);
+
+   /* samplers */
+   BEGIN_NVC0(push, NVC0_CP(TSC_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->txc->offset + 65536);
+   PUSH_DATA (push, screen->txc->offset + 65536);
+   PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1);
+
+   /* MS sample coordinate offsets */
+   BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
+   PUSH_DATA (push, NVC0_CB_AUX_SIZE);
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
+   BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 2 * 8);
+   PUSH_DATA (push, NVC0_CB_AUX_MS_INFO);
+   PUSH_DATA (push, 0); /* 0 */
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 1); /* 1 */
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 0); /* 2 */
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 1); /* 3 */
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 2); /* 4 */
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 3); /* 5 */
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 2); /* 6 */
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 3); /* 7 */
+   PUSH_DATA (push, 1);
  
     return 0;
  }
  
-bool
-nvc0_compute_validate_program(struct nvc0_context *nvc0)
+static void
+nvc0_compute_validate_samplers(struct nvc0_context *nvc0) /* validate slot-5 TSC state, then dirty the 3D samplers */
  {
-   struct nvc0_program *prog = nvc0->compprog;
+   bool need_flush = nvc0_validate_tsc(nvc0, 5); /* 5: slot used for compute here; the 3D loop below covers 0..4 */
+   if (need_flush) { /* helper reported that a flush is needed: emit NVC0_CP(TSC_FLUSH) with data 0 */
+      BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(TSC_FLUSH), 1);
+      PUSH_DATA (nvc0->base.pushbuf, 0);
+   }
  
-   if (prog->mem)
-      return true;
+   /* Invalidate all 3D samplers because they are aliased. */
+   for (int s = 0; s < 5; s++)
+      nvc0->samplers_dirty[s] = ~0; /* set every dirty bit for stage s */
+   nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLERS; /* presumably makes the next 3D validation re-emit samplers — confirm */
+}
  
-   if (!prog->translated) {
-      prog->translated = nvc0_program_translate(
-         prog, nvc0->screen->base.device->chipset, &nvc0->base.debug);
-      if (!prog->translated)
-         return false;
+static void
+nvc0_compute_validate_textures(struct nvc0_context *nvc0)
+{ /* validate slot-5 TIC state, then drop and dirty all 3D texture bindings */
+   bool need_flush = nvc0_validate_tic(nvc0, 5); /* 5: slot used for compute here; the 3D loop below covers 0..4 */
+   if (need_flush) { /* helper reported that a flush is needed: emit NVC0_CP(TIC_FLUSH) with data 0 */
+      BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(TIC_FLUSH), 1);
+      PUSH_DATA (nvc0->base.pushbuf, 0);
     }
-   if (unlikely(!prog->code_size))
-      return false;
-
-   if (likely(prog->code_size)) {
-      if (nvc0_program_upload_code(nvc0, prog)) {
-         struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-         BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1);
-         PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CODE);
-         return true;
-      }
+
+   /* Invalidate all 3D textures because they are aliased. */
+   for (int s = 0; s < 5; s++) {
+      for (int i = 0; i < nvc0->num_textures[s]; i++)
+         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i)); /* reset the 3D bufctx bin for texture (s, i) */
+      nvc0->textures_dirty[s] = ~0; /* set every dirty bit for stage s */
+   }
+   nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES; /* presumably makes the next 3D validation re-emit textures — confirm */
+}
+
+static inline void
+nvc0_compute_invalidate_constbufs(struct nvc0_context *nvc0)
+{ /* mark all valid constbufs of stages 0..4 dirty and forget their bound uniform sizes */
+   int s;
+
+   /* Invalidate all 3D constbufs because they are aliased with COMPUTE. */
+   for (s = 0; s < 5; s++) {
+      nvc0->constbuf_dirty[s] |= nvc0->constbuf_valid[s]; /* only slots that are currently valid become dirty */
+      nvc0->state.uniform_buffer_bound[s] = 0; /* cached bound size cleared; the user-constbuf path compares against it */
     }
-   return false;
+   nvc0->dirty_3d |= NVC0_NEW_3D_CONSTBUF; /* flag the 3D constbuf state as needing validation */
  }
  
  static void
@@ -150,7 +198,7 @@ nvc0_compute_validate_constbufs(struct nvc0_context *nvc0)
  
        if (nvc0->constbuf[s][i].user) {
           struct nouveau_bo *bo = nvc0->screen->uniform_bo;
-         const unsigned base = s << 16;
+         const unsigned base = NVC0_CB_USR_INFO(s);
           const unsigned size = nvc0->constbuf[s][0].size;
           assert(i == 0); /* we really only want OpenGL uniforms here */
           assert(nvc0->constbuf[s][0].u.data);
@@ -158,11 +206,11 @@ nvc0_compute_validate_constbufs(struct nvc0_context *nvc0)
           if (nvc0->state.uniform_buffer_bound[s] < size) {
              nvc0->state.uniform_buffer_bound[s] = align(size, 0x100);
  
-            BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3);
+            BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
              PUSH_DATA (push, nvc0->state.uniform_buffer_bound[s]);
              PUSH_DATAh(push, bo->offset + base);
              PUSH_DATA (push, bo->offset + base);
-            BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1);
+            BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
              PUSH_DATA (push, (0 << 8) | 1);
           }
           nvc0_cb_bo_push(&nvc0->base, bo, NV_VRAM_DOMAIN(&nvc0->screen->base),
@@ -173,18 +221,18 @@ nvc0_compute_validate_constbufs(struct nvc0_context *nvc0)
           struct nv04_resource *res =
              nv04_resource(nvc0->constbuf[s][i].u.buf);
           if (res) {
-            BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3);
+            BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
              PUSH_DATA (push, nvc0->constbuf[s][i].size);
              PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
              PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
-            BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1);
+            BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
              PUSH_DATA (push, (i << 8) | 1);
  
              BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
  
              res->cb_bindings[s] |= 1 << i;
           } else {
-            BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1);
+            BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
              PUSH_DATA (push, (i << 8) | 0);
           }
           if (i == 0)
@@ -192,54 +240,184 @@ nvc0_compute_validate_constbufs(struct nvc0_context *nvc0)
        }
     }
  
-   BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1);
+   nvc0_compute_invalidate_constbufs(nvc0);
+
+   BEGIN_NVC0(push, NVC0_CP(FLUSH), 1);
     PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CB);
  }
  
-static bool
-nvc0_compute_state_validate(struct nvc0_context *nvc0)
+static void
+nvc0_compute_validate_driverconst(struct nvc0_context *nvc0) /* bind the slot-5 aux area of uniform_bo as compute constbuf 15 */
  {
-   if (!nvc0_compute_validate_program(nvc0))
-      return false;
-   if (nvc0->dirty_cp & NVC0_NEW_CP_CONSTBUF)
-      nvc0_compute_validate_constbufs(nvc0);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
  
-   /* TODO: textures, samplers, surfaces, global memory buffers */
+   BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); /* 3 words follow: size, then the address via PUSH_DATAh / PUSH_DATA */
+   PUSH_DATA (push, NVC0_CB_AUX_SIZE);
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
+   BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
+   PUSH_DATA (push, (15 << 8) | 1); /* presumably (index << 8) | valid, i.e. bind index 15 — confirm against class docs */
  
-   nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false);
+   nvc0->dirty_3d |= NVC0_NEW_3D_DRIVERCONST; /* 3D driver constbuf is flagged dirty as well */
+}
  
-   nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp);
-   if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf)))
-      return false;
-   if (unlikely(nvc0->state.flushed))
-      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
+static void
+nvc0_compute_validate_buffers(struct nvc0_context *nvc0)
+{ /* upload one 4-word record per slot-5 buffer binding into the aux constbuf */
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
+   const int s = 5; /* slot used for compute in this file; the 3D invalidation loops cover 0..4 */
+   int i;
+
+   BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); /* select the slot-5 aux area of uniform_bo as the upload target */
+   PUSH_DATA (push, NVC0_CB_AUX_SIZE);
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
+   BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS); /* 1 position word + 4 data words per buffer slot */
+   PUSH_DATA (push, NVC0_CB_AUX_BUF_INFO(0)); /* position word: location of buffer-info entry 0 */
+
+   for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
+      if (nvc0->buffers[s][i].buffer) {
+         struct nv04_resource *res =
+            nv04_resource(nvc0->buffers[s][i].buffer);
+         PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset); /* words 0/1: address of the bound range */
+         PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset); /* (presumably low half first, then high half) */
+         PUSH_DATA (push, nvc0->buffers[s][i].buffer_size); /* word 2: size of the bound range */
+         PUSH_DATA (push, 0); /* word 3: always zero here */
+         BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR); /* reference the resource from the compute bufctx, read/write */
+         util_range_add(&res->valid_buffer_range,
+                        nvc0->buffers[s][i].buffer_offset,
+                        nvc0->buffers[s][i].buffer_offset +
+                        nvc0->buffers[s][i].buffer_size); /* record offset .. offset + size in the valid range */
+      } else { /* unbound slot: still emit 4 zero words so the word count announced above is met */
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+      }
+   }
+}
+
+void
+nvc0_compute_validate_globals(struct nvc0_context *nvc0)
+{ /* add every non-NULL entry of nvc0->global_residents to the compute bufctx */
+   unsigned i;
+
+   for (i = 0; i < nvc0->global_residents.size / sizeof(struct pipe_resource *); /* size / element size = entry count */
+        ++i) {
+      struct pipe_resource *res = *util_dynarray_element(
+         &nvc0->global_residents, struct pipe_resource *, i);
+      if (res) /* NULL entries are skipped */
+         nvc0_add_resident(nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL,
+                           nv04_resource(res), NOUVEAU_BO_RDWR); /* registered with read/write access */
+   }
+}
  
-   return true;
+static inline void
+nvc0_compute_invalidate_surfaces(struct nvc0_context *nvc0, const int s)
+{ /* write the same fixed 6-word pattern to every IMAGE(i) slot of the class selected by s */
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   int i;
  
+   for (i = 0; i < NVC0_MAX_IMAGES; ++i) {
+      if (s == 5) /* s == 5 selects the NVC0_CP methods; any other value selects NVC0_3D */
+         BEGIN_NVC0(push, NVC0_CP(IMAGE(i)), 6);
+      else
+         BEGIN_NVC0(push, NVC0_3D(IMAGE(i)), 6);
+      PUSH_DATA(push, 0);
+      PUSH_DATA(push, 0);
+      PUSH_DATA(push, 0);
+      PUSH_DATA(push, 0);
+      PUSH_DATA(push, 0x14000); /* NOTE(review): meaning of 0x14000 is not visible here — confirm against class docs */
+      PUSH_DATA(push, 0);
+   }
+}
+
+static void
+nvc0_compute_validate_surfaces(struct nvc0_context *nvc0)
+{ /* reset 3D and compute image slots, validate slot-5 surfaces, then dirty stage-4 images */
+   /* TODO: Invalidating both 3D and CP surfaces before validating surfaces for
+    * compute is probably not really necessary, but we didn't find any better
+    * solutions for now. This fixes some invalidation issues when compute and
+    * fragment shaders are used inside the same context. Anyway, we definitely
+    * have invalidation issues between 3D and CP for other resources like SSBO
+    * and atomic counters. */
+   nvc0_compute_invalidate_surfaces(nvc0, 4); /* any s != 5 resets the 3D image slots */
+   nvc0_compute_invalidate_surfaces(nvc0, 5); /* s == 5 resets the compute image slots */
+
+   nvc0_validate_suf(nvc0, 5); /* validate surfaces for slot 5 after both resets */
+
+   /* Invalidate all FRAGMENT images because they are aliased with COMPUTE. */
+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_SUF); /* reset the 3D surface bin of the 3D bufctx */
+   nvc0->dirty_3d |= NVC0_NEW_3D_SURFACES;
+   nvc0->images_dirty[4] |= nvc0->images_valid[4]; /* every currently valid stage-4 image becomes dirty */
+}
+
+static struct nvc0_state_validate
+validate_list_cp[] = { /* { callback, NVC0_NEW_CP_* bit } pairs passed to nvc0_state_validate() for compute */
+   { nvc0_compprog_validate,              NVC0_NEW_CP_PROGRAM     },
+   { nvc0_compute_validate_constbufs,     NVC0_NEW_CP_CONSTBUF    },
+   { nvc0_compute_validate_driverconst,   NVC0_NEW_CP_DRIVERCONST },
+   { nvc0_compute_validate_buffers,       NVC0_NEW_CP_BUFFERS     },
+   { nvc0_compute_validate_textures,      NVC0_NEW_CP_TEXTURES    },
+   { nvc0_compute_validate_samplers,      NVC0_NEW_CP_SAMPLERS    },
+   { nvc0_compute_validate_globals,       NVC0_NEW_CP_GLOBALS     },
+   { nvc0_compute_validate_surfaces,      NVC0_NEW_CP_SURFACES    },
+}; /* NOTE(review): presumably processed in array order — confirm in nvc0_state_validate() */
+
+static bool
+nvc0_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
+{ /* run validate_list_cp through nvc0_state_validate() and return its result */
+   bool ret;
+
+   ret = nvc0_state_validate(nvc0, mask, validate_list_cp,
+                             ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp,
+                             nvc0->bufctx_cp); /* operates on the compute dirty mask and the compute bufctx */
+
+   if (unlikely(nvc0->state.flushed)) /* checked regardless of ret */
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
+   return ret;
  }
  
  static void
-nvc0_compute_upload_input(struct nvc0_context *nvc0, const void *input)
+nvc0_compute_upload_input(struct nvc0_context *nvc0,
+                          const struct pipe_grid_info *info) /* uploads info->input (if any) and info->work_dim */
  {
     struct nouveau_pushbuf *push = nvc0->base.pushbuf;
     struct nvc0_screen *screen = nvc0->screen;
     struct nvc0_program *cp = nvc0->compprog;
  
     if (cp->parm_size) {
-      BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3);
+      struct nouveau_bo *bo = screen->uniform_bo; /* kernel input now lives in uniform_bo */
+      const unsigned base = NVC0_CB_USR_INFO(5); /* offset of the slot-5 user area inside uniform_bo */
+
+      BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
        PUSH_DATA (push, align(cp->parm_size, 0x100));
-      PUSH_DATAh(push, screen->parm->offset);
-      PUSH_DATA (push, screen->parm->offset);
-      BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1);
+      PUSH_DATAh(push, bo->offset + base);
+      PUSH_DATA (push, bo->offset + base);
+      BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
        PUSH_DATA (push, (0 << 8) | 1);
        /* NOTE: size is limited to 4 KiB, which is < NV04_PFIFO_MAX_PACKET_LEN */
-      BEGIN_1IC0(push, NVC0_COMPUTE(CB_POS), 1 + cp->parm_size / 4);
+      BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + cp->parm_size / 4); /* 1 position word + parm_size / 4 data words */
        PUSH_DATA (push, 0);
-      PUSH_DATAp(push, input, cp->parm_size / 4);
+      PUSH_DATAp(push, info->input, cp->parm_size / 4); /* input data is read from the grid info */
  
-      BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1);
-      PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CB);
+      nvc0_compute_invalidate_constbufs(nvc0); /* marks the 3D constbufs dirty and sets NVC0_NEW_3D_CONSTBUF */
     }
+
+   BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); /* unconditional: select the slot-5 aux area as the upload target */
+   PUSH_DATA (push, NVC0_CB_AUX_SIZE);
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
+
+   BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 1); /* 1 position word + 1 data word */
+   /* (7) as we only upload work_dim on nvc0, the rest uses special regs */
+   PUSH_DATA (push, NVC0_CB_AUX_GRID_INFO(7));
+   PUSH_DATA (push, info->work_dim);
+
+   BEGIN_NVC0(push, NVC0_CP(FLUSH), 1); /* the CB flush is now emitted even when parm_size is 0 */
+   PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CB);
  }
  
  void
@@ -248,64 +426,76 @@ nvc0_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
     struct nvc0_context *nvc0 = nvc0_context(pipe);
     struct nouveau_pushbuf *push = nvc0->base.pushbuf;
     struct nvc0_program *cp = nvc0->compprog;
-   unsigned s;
     int ret;
  
-   ret = !nvc0_compute_state_validate(nvc0);
+   ret = !nvc0_state_validate_cp(nvc0, ~0);
     if (ret) {
        NOUVEAU_ERR("Failed to launch grid !\n");
        return;
     }
  
-   nvc0_compute_upload_input(nvc0, info->input);
+   nvc0_compute_upload_input(nvc0, info);
  
-   BEGIN_NVC0(push, NVC0_COMPUTE(CP_START_ID), 1);
+   BEGIN_NVC0(push, NVC0_CP(CP_START_ID), 1);
     PUSH_DATA (push, nvc0_program_symbol_offset(cp, info->pc));
  
-   BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_POS_ALLOC), 3);
-   PUSH_DATA (push, align(cp->cp.lmem_size, 0x10));
+   BEGIN_NVC0(push, NVC0_CP(LOCAL_POS_ALLOC), 3);
+   PUSH_DATA (push, (cp->hdr[1] & 0xfffff0) + align(cp->cp.lmem_size, 0x10));
     PUSH_DATA (push, 0);
     PUSH_DATA (push, 0x800); /* WARP_CSTACK_SIZE */
  
-   BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_SIZE), 3);
+   BEGIN_NVC0(push, NVC0_CP(SHARED_SIZE), 3);
     PUSH_DATA (push, align(cp->cp.smem_size, 0x100));
     PUSH_DATA (push, info->block[0] * info->block[1] * info->block[2]);
     PUSH_DATA (push, cp->num_barriers);
-   BEGIN_NVC0(push, NVC0_COMPUTE(CP_GPR_ALLOC), 1);
+   BEGIN_NVC0(push, NVC0_CP(CP_GPR_ALLOC), 1);
     PUSH_DATA (push, cp->num_gprs);
  
-   /* grid/block setup */
-   BEGIN_NVC0(push, NVC0_COMPUTE(GRIDDIM_YX), 2);
-   PUSH_DATA (push, (info->grid[1] << 16) | info->grid[0]);
-   PUSH_DATA (push, info->grid[2]);
-   BEGIN_NVC0(push, NVC0_COMPUTE(BLOCKDIM_YX), 2);
-   PUSH_DATA (push, (info->block[1] << 16) | info->block[0]);
-   PUSH_DATA (push, info->block[2]);
-
     /* launch preliminary setup */
-   BEGIN_NVC0(push, NVC0_COMPUTE(GRIDID), 1);
+   BEGIN_NVC0(push, NVC0_CP(GRIDID), 1);
     PUSH_DATA (push, 0x1);
-   BEGIN_NVC0(push, SUBC_COMPUTE(0x036c), 1);
+   BEGIN_NVC0(push, SUBC_CP(0x036c), 1);
     PUSH_DATA (push, 0);
-   BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1);
+   BEGIN_NVC0(push, NVC0_CP(FLUSH), 1);
     PUSH_DATA (push, NVC0_COMPUTE_FLUSH_GLOBAL | NVC0_COMPUTE_FLUSH_UNK8);
  
-   /* kernel launching */
-   BEGIN_NVC0(push, NVC0_COMPUTE(COMPUTE_BEGIN), 1);
-   PUSH_DATA (push, 0);
-   BEGIN_NVC0(push, SUBC_COMPUTE(0x0a08), 1);
-   PUSH_DATA (push, 0);
-   BEGIN_NVC0(push, NVC0_COMPUTE(LAUNCH), 1);
-   PUSH_DATA (push, 0x1000);
-   BEGIN_NVC0(push, NVC0_COMPUTE(COMPUTE_END), 1);
-   PUSH_DATA (push, 0);
-   BEGIN_NVC0(push, SUBC_COMPUTE(0x0360), 1);
-   PUSH_DATA (push, 0x1);
+   /* block setup */
+   BEGIN_NVC0(push, NVC0_CP(BLOCKDIM_YX), 2);
+   PUSH_DATA (push, (info->block[1] << 16) | info->block[0]);
+   PUSH_DATA (push, info->block[2]);
  
-   /* Invalidate all 3D constbufs because they are aliased with COMPUTE. */
-   nvc0->dirty |= NVC0_NEW_CONSTBUF;
-   for (s = 0; s < 5; s++) {
-      nvc0->constbuf_dirty[s] |= nvc0->constbuf_valid[s];
-      nvc0->state.uniform_buffer_bound[s] = 0;
+   if (unlikely(info->indirect)) {
+      struct nv04_resource *res = nv04_resource(info->indirect);
+      uint32_t offset = res->offset + info->indirect_offset;
+      unsigned macro = NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT;
+
+      nouveau_pushbuf_space(push, 16, 0, 1);
+      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+      PUSH_DATA(push, NVC0_FIFO_PKHDR_1I(1, macro, 3));
+      nouveau_pushbuf_data(push, res->bo, offset,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
+   } else {
+      /* grid setup */
+      BEGIN_NVC0(push, NVC0_CP(GRIDDIM_YX), 2);
+      PUSH_DATA (push, (info->grid[1] << 16) | info->grid[0]);
+      PUSH_DATA (push, info->grid[2]);
+
+      /* kernel launching */
+      BEGIN_NVC0(push, NVC0_CP(COMPUTE_BEGIN), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NVC0(push, SUBC_CP(0x0a08), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NVC0(push, NVC0_CP(LAUNCH), 1);
+      PUSH_DATA (push, 0x1000);
+      BEGIN_NVC0(push, NVC0_CP(COMPUTE_END), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NVC0(push, SUBC_CP(0x0360), 1);
+      PUSH_DATA (push, 0x1);
     }
+
+   /* TODO: Not sure if this is really necessary. */
+   nvc0_compute_invalidate_surfaces(nvc0, 5);
+   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF);
+   nvc0->dirty_cp |= NVC0_NEW_CP_SURFACES;
+   nvc0->images_dirty[5] |= nvc0->images_valid[5];
  }