lima/ppir: enable vectorize optimization
[mesa.git] / src / gallium / drivers / nouveau / nvc0 / nve4_compute.c
index f641f4777dfc187a4982d50f8bc073fd42875dd2..91c26718fbec3f758c0f4d69b46d3c4547a0e7f9 100644 (file)
@@ -27,7 +27,7 @@
 
 #include "codegen/nv50_ir_driver.h"
 
-#ifdef DEBUG
+#ifndef NDEBUG
 static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *);
 static void gp100_compute_dump_launch_desc(const struct gp100_cp_launch_desc *);
 #endif
@@ -59,7 +59,8 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
       obj_class = GM200_COMPUTE_CLASS;
       break;
    case 0x130:
-      obj_class = dev->chipset == 0x130 ? GP100_COMPUTE_CLASS : GP104_COMPUTE_CLASS;
+      obj_class = (dev->chipset == 0x130 || dev->chipset == 0x13b) ?
+                      GP100_COMPUTE_CLASS : GP104_COMPUTE_CLASS;
       break;
    default:
       NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
@@ -392,23 +393,24 @@ nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
             uint64_t address
                = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
 
-            assert(i > 0); /* we really only want uniform buffer objects */
-
-            BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-            PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
-            PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
-            BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
-            PUSH_DATA (push, 4 * 4);
-            PUSH_DATA (push, 0x1);
-            BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
-            PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
-
-            PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
-            PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
-            PUSH_DATA (push, nvc0->constbuf[5][i].size);
-            PUSH_DATA (push, 0);
-            BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
+            /* constbufs above 0 are fetched via ubo info in the shader */
+            if (i > 0) {
+               BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+               PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+               PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+               BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+               PUSH_DATA (push, 4 * 4);
+               PUSH_DATA (push, 0x1);
+               BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
+               PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+
+               PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
+               PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
+               PUSH_DATA (push, nvc0->constbuf[s][i].size);
+               PUSH_DATA (push, 0);
+            }
 
+            BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
             res->cb_bindings[s] |= 1 << i;
          }
       }
@@ -550,6 +552,30 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
    return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1;
 }
 
+static void
+nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
+{
+   // Only constbuf[5][0..6] can go in the launch descriptor (skipping user or
+   // unbound entries); higher indices are loaded through global memory.
+   for (int i = 0; i <= 6; i++) {
+      if (nvc0->constbuf[5][i].user || !nvc0->constbuf[5][i].u.buf)
+         continue;
+
+      struct nv04_resource *res =
+         nv04_resource(nvc0->constbuf[5][i].u.buf);
+
+      uint32_t base = res->offset + nvc0->constbuf[5][i].offset;
+      uint32_t size = nvc0->constbuf[5][i].size;
+      if (gp100) // desc is untyped; the flag selects the matching setter
+         gp100_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
+      else
+         nve4_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
+   }
+
+   // There is no need to do FLUSH(NVE4_COMPUTE_FLUSH_CB) here because
+   // nve4_compute_upload_input() does it later.
+}
+
 static void
 nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
                                struct nve4_cp_launch_desc *desc,
@@ -584,9 +610,15 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
    if (nvc0->constbuf[5][0].user || cp->parm_size) {
       nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
                                  NVC0_CB_USR_INFO(5), 1 << 16);
+
+      // Later logic will attempt to bind a real buffer at position 0. That
+      // should not happen if we've bound a user buffer.
+      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
    }
    nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
                               NVC0_CB_AUX_INFO(5), 1 << 11);
+
+   nve4_compute_setup_buf_cb(nvc0, false, desc);
 }
 
 static void
@@ -622,9 +654,15 @@ gp100_compute_setup_launch_desc(struct nvc0_context *nvc0,
    if (nvc0->constbuf[5][0].user || cp->parm_size) {
       gp100_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
                                   NVC0_CB_USR_INFO(5), 1 << 16);
+
+      // Later logic will attempt to bind a real buffer at position 0. That
+      // should not happen if we've bound a user buffer.
+      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
    }
    gp100_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
                                NVC0_CB_AUX_INFO(5), 1 << 11);
+
+   nve4_compute_setup_buf_cb(nvc0, true, desc);
 }
 
 static inline void *
@@ -667,6 +705,7 @@ void
 nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nvc0_screen *screen = nvc0->screen;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    void *desc;
    uint64_t desc_gpuaddr;
@@ -702,7 +741,7 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
 
    nve4_compute_upload_input(nvc0, info);
 
-#ifdef DEBUG
+#ifndef NDEBUG
    if (debug_get_num_option("NV50_PROG_DEBUG", 0)) {
       if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
          gp100_compute_dump_launch_desc(desc);
@@ -740,6 +779,8 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    }
 
    /* upload descriptor and flush */
+   nouveau_pushbuf_space(push, 32, 1, 0);
+   PUSH_REFN(push, screen->text, NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD);
    BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
    PUSH_DATA (push, desc_gpuaddr >> 8);
    BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
@@ -747,6 +788,8 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
    PUSH_DATA (push, 0);
 
+   nvc0_update_compute_invocations_counter(nvc0, info);
+
 out:
    if (ret)
       NOUVEAU_ERR("Failed to launch grid !\n");
@@ -835,7 +878,7 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
 }
 
 
-#ifdef DEBUG
+#ifndef NDEBUG
 static const char *nve4_cache_split_name(unsigned value)
 {
    switch (value) {