lima/ppir: enable vectorize optimization

[mesa.git] / src / gallium / drivers / nouveau / nvc0 / nve4_compute.c
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c

index f641f4777dfc187a4982d50f8bc073fd42875dd2..91c26718fbec3f758c0f4d69b46d3c4547a0e7f9 100644 (file)
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -27,7 +27,7 @@
  
  #include "codegen/nv50_ir_driver.h"
  
-#ifdef DEBUG
+#ifndef NDEBUG
  static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *);
  static void gp100_compute_dump_launch_desc(const struct gp100_cp_launch_desc *);
  #endif
@@ -59,7 +59,8 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
        obj_class = GM200_COMPUTE_CLASS;
        break;
     case 0x130:
-      obj_class = dev->chipset == 0x130 ? GP100_COMPUTE_CLASS : GP104_COMPUTE_CLASS;
+      obj_class = (dev->chipset == 0x130 || dev->chipset == 0x13b) ?
+                      GP100_COMPUTE_CLASS : GP104_COMPUTE_CLASS;
        break;
     default:
        NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
@@ -392,23 +393,24 @@ nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
              uint64_t address
                 = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
  
-            assert(i > 0); /* we really only want uniform buffer objects */
-
-            BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-            PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
-            PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
-            BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
-            PUSH_DATA (push, 4 * 4);
-            PUSH_DATA (push, 0x1);
-            BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
-            PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
-
-            PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
-            PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
-            PUSH_DATA (push, nvc0->constbuf[5][i].size);
-            PUSH_DATA (push, 0);
-            BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
+            /* constbufs above 0 are fetched via ubo info in the shader */
+            if (i > 0) {
+               BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+               PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+               PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+               BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+               PUSH_DATA (push, 4 * 4);
+               PUSH_DATA (push, 0x1);
+               BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
+               PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+
+               PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
+               PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
+               PUSH_DATA (push, nvc0->constbuf[s][i].size);
+               PUSH_DATA (push, 0);
+            }
  
+            BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
              res->cb_bindings[s] |= 1 << i;
           }
        }
@@ -550,6 +552,30 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
     return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1;
  }
  
+static void
+nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
+{
+   // only user constant buffers 0-6 can be put in the descriptor, the rest are
+   // loaded through global memory; skips .user and buffer-less entries
+   for (int i = 0; i <= 6; i++) {
+      if (nvc0->constbuf[5][i].user || !nvc0->constbuf[5][i].u.buf)
+         continue;
+
+      struct nv04_resource *res =
+         nv04_resource(nvc0->constbuf[5][i].u.buf);
+
+      uint32_t base = res->offset + nvc0->constbuf[5][i].offset;
+      uint32_t size = nvc0->constbuf[5][i].size;
+      if (gp100)
+         gp100_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
+      else
+         nve4_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
+   }
+
+   // there is no need to do FLUSH(NVE4_COMPUTE_FLUSH_CB) because
+   // nve4_compute_upload_input() does it later
+}
+
  static void
  nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
                                 struct nve4_cp_launch_desc *desc,
@@ -584,9 +610,15 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
     if (nvc0->constbuf[5][0].user || cp->parm_size) {
        nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
                                   NVC0_CB_USR_INFO(5), 1 << 16);
+
+      // Later logic will attempt to bind a real buffer at position 0. That
+      // should not happen if we've bound a user buffer.
+      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
     }
     nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
                                NVC0_CB_AUX_INFO(5), 1 << 11);
+
+   nve4_compute_setup_buf_cb(nvc0, false, desc);
  }
  
  static void
@@ -622,9 +654,15 @@ gp100_compute_setup_launch_desc(struct nvc0_context *nvc0,
     if (nvc0->constbuf[5][0].user || cp->parm_size) {
        gp100_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
                                    NVC0_CB_USR_INFO(5), 1 << 16);
+
+      // Later logic will attempt to bind a real buffer at position 0. That
+      // should not happen if we've bound a user buffer.
+      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
     }
     gp100_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
                                 NVC0_CB_AUX_INFO(5), 1 << 11);
+
+   nve4_compute_setup_buf_cb(nvc0, true, desc);
  }
  
  static inline void *
@@ -667,6 +705,7 @@ void
  nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
  {
     struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nvc0_screen *screen = nvc0->screen;
     struct nouveau_pushbuf *push = nvc0->base.pushbuf;
     void *desc;
     uint64_t desc_gpuaddr;
@@ -702,7 +741,7 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
  
     nve4_compute_upload_input(nvc0, info);
  
-#ifdef DEBUG
+#ifndef NDEBUG
     if (debug_get_num_option("NV50_PROG_DEBUG", 0)) {
        if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
           gp100_compute_dump_launch_desc(desc);
@@ -740,6 +779,8 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
     }
  
     /* upload descriptor and flush */
+   nouveau_pushbuf_space(push, 32, 1, 0);
+   PUSH_REFN(push, screen->text, NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD);
     BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
     PUSH_DATA (push, desc_gpuaddr >> 8);
     BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
@@ -747,6 +788,8 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
     BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
     PUSH_DATA (push, 0);
  
+   nvc0_update_compute_invocations_counter(nvc0, info);
+
  out:
     if (ret)
        NOUVEAU_ERR("Failed to launch grid !\n");
@@ -835,7 +878,7 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
  }
  
  
-#ifdef DEBUG
+#ifndef NDEBUG
  static const char *nve4_cache_split_name(unsigned value)
  {
     switch (value) {