nv50/ir,nvc0: use constant buffers for compute when possible on Kepler+
author Rhys Perry <pendingchaos02@gmail.com>
Fri, 3 Aug 2018 21:11:28 +0000 (22:11 +0100)
committer Rhys Perry <pendingchaos02@gmail.com>
Mon, 27 Aug 2018 13:23:42 +0000 (14:23 +0100)
Gives a +7.79% increase in FPS with Hitman on lowest quality settings on
my GTX 1060.

total instructions in shared programs : 5787979 -> 5748677 (-0.68%)
total gprs used in shared programs    : 669901 -> 669373 (-0.08%)
total shared used in shared programs  : 548832 -> 548832 (0.00%)
total local used in shared programs   : 21068 -> 21064 (-0.02%)

                local     shared        gpr       inst      bytes
    helped           1           0         152         274         274
      hurt           0           0           0           0           0

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Karol Herbst <kherbst@redhat.com>
src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
src/gallium/drivers/nouveau/nvc0/nve4_compute.c

index 87ded68e5a73ce0a444244381741e443773ca0dd..176e0cf608d38a5f9c500354f05dcd15bb719780 100644 (file)
@@ -2464,18 +2464,16 @@ NVC0LoweringPass::handleLDST(Instruction *i)
          assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
       }
    } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
+      int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
+      Value *ind = i->getIndirect(0, 1);
+
       if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
-          prog->getType() == Program::TYPE_COMPUTE) {
+          prog->getType() == Program::TYPE_COMPUTE &&
+          (fileIndex >= 6 || ind)) {
          // The launch descriptor only allows to set up 8 CBs, but OpenGL
-         // requires at least 12 UBOs. To bypass this limitation, we store the
-         // addrs into the driver constbuf and we directly load from the global
-         // memory.
-         int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
-         Value *ind = i->getIndirect(0, 1);
-
-         if (!ind && fileIndex == -1)
-            return;
-
+         // requires at least 12 UBOs. To bypass this limitation, for constant
+         // buffers 7+, we store the addrs into the driver constbuf and we
+         // directly load from the global memory.
          if (ind) {
             // Clamp the UBO index when an indirect access is used to avoid
             // loading information from the wrong place in the driver cb.
index 28460f8cbeb875d196e1cb1334a93fc7e9ca9a32..8aa8d4936f32f53c0fe67717776aeab9e9dbbbfa 100644 (file)
@@ -551,6 +551,30 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
    return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1;
 }
 
+static void
+nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
+{
+   // The launch descriptor only has room for user constant buffers 1-6;
+   // anything beyond that is fetched through global memory instead.
+   for (int slot = 1; slot <= 6; ++slot) {
+      const bool bound = !nvc0->constbuf[5][slot].user &&
+                         nvc0->constbuf[5][slot].u.buf;
+      if (bound) {
+         struct nv04_resource *cb_res =
+            nv04_resource(nvc0->constbuf[5][slot].u.buf);
+         uint32_t size = nvc0->constbuf[5][slot].size;
+         uint32_t base = cb_res->offset + nvc0->constbuf[5][slot].offset;
+         if (!gp100)
+            nve4_cp_launch_desc_set_cb(desc, slot, cb_res->bo, base, size);
+         else
+            gp100_cp_launch_desc_set_cb(desc, slot, cb_res->bo, base, size);
+      }
+   }
+
+   // FLUSH(NVE4_COMPUTE_FLUSH_CB) is deliberately not issued here:
+   // nve4_compute_upload_input() takes care of it afterwards.
+}
+
 static void
 nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
                                struct nve4_cp_launch_desc *desc,
@@ -588,6 +612,8 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
    }
    nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
                               NVC0_CB_AUX_INFO(5), 1 << 11);
+
+   nve4_compute_setup_buf_cb(nvc0, false, desc);
 }
 
 static void
@@ -626,6 +652,8 @@ gp100_compute_setup_launch_desc(struct nvc0_context *nvc0,
    }
    gp100_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
                                NVC0_CB_AUX_INFO(5), 1 << 11);
+
+   nve4_compute_setup_buf_cb(nvc0, true, desc);
 }
 
 static inline void *