From 1906d2b46b21a8e7496409e0639d8463ad86dcfe Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Wed, 27 Jun 2012 14:45:17 +0200
Subject: [PATCH] nv50: dynamically allocate space for shader local storage

Fixes 21 piglit tests:
spec/glsl-1.10/execution/variable-indexing/
fs-temp-array-mat4-index-col-row-wr
vs-temp-array-mat4-index-col-row-wr
vs-temp-array-mat4-index-row-wr

spec/glsl-1.20/execution/variable-indexing/
fs-temp-array-mat3-index-col-row-rd
fs-temp-array-mat3-index-row-rd
fs-temp-array-mat4-col-row-wr
fs-temp-array-mat4-index-col-row-rd
fs-temp-array-mat4-index-col-row-wr
fs-temp-array-mat4-index-row-rd
fs-temp-array-mat4-index-row-wr
vs-temp-array-mat3-index-col-row-rd
vs-temp-array-mat3-index-col-row-wr
vs-temp-array-mat3-index-row-rd
vs-temp-array-mat3-index-row-wr
vs-temp-array-mat4-col-row-wr
vs-temp-array-mat4-index-col-row-rd
vs-temp-array-mat4-index-col-row-wr
vs-temp-array-mat4-index-col-wr
vs-temp-array-mat4-index-row-rd
vs-temp-array-mat4-index-row-wr
vs-temp-array-mat4-index-wr

... and prevents a lot of GPU lockups
---
 src/gallium/drivers/nv50/nv50_context.h      |   1 +
 src/gallium/drivers/nv50/nv50_program.c      |   7 ++
 src/gallium/drivers/nv50/nv50_program.h      |   4 +-
 src/gallium/drivers/nv50/nv50_screen.c       | 107 +++++++++++++++----
 src/gallium/drivers/nv50/nv50_screen.h       |   7 +-
 src/gallium/drivers/nv50/nv50_shader_state.c |   7 +-
 6 files changed, 108 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 5da0473f0e6..818c3bb15fe 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -97,6 +97,7 @@ struct nv50_context {
       boolean flushed;
       boolean rasterizer_discard;
       uint8_t tls_required;
+      boolean new_tls_space;
       uint8_t num_vtxbufs;
       uint8_t num_vtxelts;
       uint8_t num_textures[3];
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 9c5a080f6d6..72d14a6e3a0 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -350,6 +350,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
    prog->code_size = info->bin.codeSize;
    prog->fixups = info->bin.relocData;
    prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
+   prog->tls_space = info->bin.tlsSpace;
 
    if (prog->type == PIPE_SHADER_FRAGMENT) {
       if (info->prop.fp.writesDepth) {
@@ -399,6 +400,12 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
    }
    prog->code_base = prog->mem->start;
 
+   ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
+   if (ret < 0)
+      return FALSE;
+   if (ret > 0)
+      nv50->state.new_tls_space = TRUE;
+
    if (prog->fixups)
       nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
 
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index f56268b5439..24eaebfb473 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -28,8 +28,6 @@ struct nv50_context;
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
 
-#define NV50_CAP_MAX_PROGRAM_TEMPS 64
-
 struct nv50_varying {
    uint8_t id; /* tgsi index */
    uint8_t hw; /* hw index, nv50 wants flat FP inputs last */
@@ -56,7 +54,6 @@ struct nv50_program {
 
    ubyte type;
    boolean translated;
-   boolean uses_lmem;
 
    uint32_t *code;
    unsigned code_size;
@@ -64,6 +61,7 @@ struct nv50_program {
    uint32_t *immd;
    unsigned immd_size;
    unsigned parm_size; /* size limit of uniform buffer */
+   uint32_t tls_space; /* required local memory per thread */
 
    ubyte max_gpr; /* REG_ALLOC_TEMP */
    ubyte max_out; /* REG_ALLOC_RESULT or FP_RESULT_COUNT */
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index 873946f520d..8c30c96aa73 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -28,11 +28,21 @@
 #include "nv50_screen.h"
 
 #include "nouveau/nv_object.xml.h"
+#include <errno.h>
 
 #ifndef NOUVEAU_GETPARAM_GRAPH_UNITS
 # define NOUVEAU_GETPARAM_GRAPH_UNITS 13
 #endif
 
+/* affected by LOCAL_WARPS_LOG_ALLOC / LOCAL_WARPS_NO_CLAMP */
+#define LOCAL_WARPS_ALLOC 32
+/* affected by STACK_WARPS_LOG_ALLOC / STACK_WARPS_NO_CLAMP */
+#define STACK_WARPS_ALLOC 32
+
+#define THREADS_IN_WARP 32
+/* bytes of local storage for one temporary: a 4-component float vector */
+#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float))
+
 static boolean
 nv50_screen_is_format_supported(struct pipe_screen *pscreen,
                                 enum pipe_format format,
@@ -209,7 +219,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_MAX_PREDS:
       return 0;
    case PIPE_SHADER_CAP_MAX_TEMPS:
-      return NV50_CAP_MAX_PROGRAM_TEMPS;
+      return nv50_screen(pscreen)->max_tls_space / ONE_TEMP_SIZE;
    case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
       return 1;
    case PIPE_SHADER_CAP_SUBROUTINES:
@@ -311,7 +321,7 @@ nv50_screen_fence_update(struct pipe_screen *pscreen)
 }
 
 static void
-nv50_screen_init_hwctx(struct nv50_screen *screen, unsigned tls_space)
+nv50_screen_init_hwctx(struct nv50_screen *screen)
 {
    struct nouveau_pushbuf *push = screen->base.pushbuf;
    struct nv04_fifo *fifo;
@@ -411,7 +421,7 @@ nv50_screen_init_hwctx(struct nv50_screen *screen, unsigned tls_space)
    BEGIN_NV04(push, NV50_3D(LOCAL_ADDRESS_HIGH), 3);
    PUSH_DATAh(push, screen->tls_bo->offset);
    PUSH_DATA (push, screen->tls_bo->offset);
-   PUSH_DATA (push, util_logbase2(tls_space / 8));
+   PUSH_DATA (push, util_logbase2(screen->cur_tls_space / 8));
 
    BEGIN_NV04(push, NV50_3D(STACK_ADDRESS_HIGH), 3);
    PUSH_DATAh(push, screen->stack_bo->offset);
@@ -508,6 +518,60 @@ nv50_screen_init_hwctx(struct nv50_screen *screen, unsigned tls_space)
    PUSH_KICK (push);
 }
 
+/* Allocate the TLS bo for tls_space bytes per thread; the temp count is rounded
+ * up to a power of two. Stores the bo size in *tls_size; returns 0 on success. */
+static int nv50_tls_alloc(struct nv50_screen *screen, unsigned tls_space,
+      uint64_t *tls_size)
+{
+   unsigned temps = util_next_power_of_two(
+         (tls_space + ONE_TEMP_SIZE - 1) / ONE_TEMP_SIZE);
+   int ret;
+
+   screen->cur_tls_space = temps * ONE_TEMP_SIZE;
+   if (nouveau_mesa_debug)
+      debug_printf("allocating space for %u temps\n", temps);
+   /* widen first: the product of all factors may not fit in 32 bits */
+   *tls_size = (uint64_t)screen->cur_tls_space *
+         util_next_power_of_two(screen->TPs) * screen->MPsInTP *
+         LOCAL_WARPS_ALLOC * THREADS_IN_WARP;
+
+   ret = nouveau_bo_new(screen->base.device, NOUVEAU_BO_VRAM, 1 << 16,
+                        *tls_size, NULL, &screen->tls_bo);
+   if (ret)
+      NOUVEAU_ERR("Failed to allocate local bo: %d\n", ret);
+   return ret;
+}
+
+/* Ensure the TLS bo holds tls_space bytes per thread. Returns 0 if it already
+ * does, 1 after allocating a new bo, else an error (-ENOMEM if too large). */
+int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space)
+{
+   struct nouveau_pushbuf *push = screen->base.pushbuf;
+   int ret;
+   uint64_t tls_size;
+
+   if (tls_space <= screen->cur_tls_space)
+      return 0; /* current bo is already large enough, equal size included */
+   if (tls_space > screen->max_tls_space) {
+      /* fixable by limiting warps (LOCAL_WARPS_LOG_ALLOC / LOCAL_WARPS_NO_CLAMP) */
+      NOUVEAU_ERR("Unsupported number of temporaries (%u > %u). Fixable if someone cares.\n",
+            (unsigned)(tls_space / ONE_TEMP_SIZE),
+            (unsigned)(screen->max_tls_space / ONE_TEMP_SIZE));
+      return -ENOMEM;
+   }
+
+   nouveau_bo_ref(NULL, &screen->tls_bo);
+   ret = nv50_tls_alloc(screen, tls_space, &tls_size);
+   if (ret)
+      return ret;
+
+   BEGIN_NV04(push, NV50_3D(LOCAL_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->tls_bo->offset);
+   PUSH_DATA (push, screen->tls_bo->offset);
+   PUSH_DATA (push, util_logbase2(screen->cur_tls_space / 8));
+   return 1;
+}
+
 struct pipe_screen *
 nv50_screen_create(struct nouveau_device *dev)
 {
@@ -516,7 +580,7 @@ nv50_screen_create(struct nouveau_device *dev)
    struct nouveau_object *chan;
    uint64_t value;
    uint32_t tesla_class;
-   unsigned stack_size, max_warps, tls_space;
+   unsigned stack_size;
    int ret;
 
    screen = CALLOC_STRUCT(nv50_screen);
@@ -637,10 +701,11 @@ nv50_screen_create(struct nouveau_device *dev)
 
    nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
 
-   max_warps  = util_bitcount(value & 0xffff);
-   max_warps *= util_bitcount((value >> 24) & 0xf) * 32;
+   screen->TPs = util_bitcount(value & 0xffff);
+   screen->MPsInTP = util_bitcount((value >> 24) & 0xf);
 
-   stack_size = max_warps * 64 * 8;
+   stack_size = util_next_power_of_two(screen->TPs) * screen->MPsInTP *
+         STACK_WARPS_ALLOC * 64 * 8;
 
    ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, stack_size, NULL,
                         &screen->stack_bo);
@@ -649,20 +714,24 @@ nv50_screen_create(struct nouveau_device *dev)
       goto fail;
    }
 
-   tls_space = NV50_CAP_MAX_PROGRAM_TEMPS * 16;
+   uint64_t size_of_one_temp = util_next_power_of_two(screen->TPs) *
+         screen->MPsInTP * LOCAL_WARPS_ALLOC *  THREADS_IN_WARP *
+         ONE_TEMP_SIZE;
+   screen->max_tls_space = dev->vram_size / size_of_one_temp * ONE_TEMP_SIZE;
+   screen->max_tls_space /= 2; /* half of vram */
 
-   screen->tls_size = tls_space * max_warps * 32;
+   /* hw can address max 64 KiB */
+   screen->max_tls_space = MIN2(screen->max_tls_space, 64 << 10);
 
-   if (nouveau_mesa_debug)
-      debug_printf("max_warps = %i, tls_size = %"PRIu64" KiB\n",
-                     max_warps, screen->tls_size >> 10);
-
-   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, screen->tls_size, NULL,
-                        &screen->tls_bo);
-   if (ret) {
-      NOUVEAU_ERR("Failed to allocate local bo: %d\n", ret);
+   uint64_t tls_size;
+   unsigned tls_space = 4/*temps*/ * ONE_TEMP_SIZE;
+   ret = nv50_tls_alloc(screen, tls_space, &tls_size);
+   if (ret)
       goto fail;
-   }
+
+   if (nouveau_mesa_debug)
+      debug_printf("TPs = %u, MPsInTP = %u, VRAM = %"PRIu64" MiB, tls_size = %"PRIu64" KiB\n",
+            screen->TPs, screen->MPsInTP, dev->vram_size >> 20, tls_size >> 10);
 
    ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, 4 << 16, NULL,
                         &screen->uniforms);
@@ -684,7 +753,7 @@ nv50_screen_create(struct nouveau_device *dev)
    if (!nv50_blitctx_create(screen))
       goto fail;
 
-   nv50_screen_init_hwctx(screen, tls_space);
+   nv50_screen_init_hwctx(screen);
 
    nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE);
 
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
index 4292f7fc9a6..3ecf20c1e03 100644
--- a/src/gallium/drivers/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -34,7 +34,10 @@ struct nv50_screen {
    struct nouveau_bo *stack_bo;
    struct nouveau_bo *tls_bo;
 
-   uint64_t tls_size;
+   unsigned TPs;
+   unsigned MPsInTP;
+   unsigned max_tls_space;
+   unsigned cur_tls_space;
 
    struct nouveau_heap *vp_code_heap;
    struct nouveau_heap *gp_code_heap;
@@ -143,4 +146,6 @@ nv50_screen_tsc_free(struct nv50_screen *screen, struct nv50_tsc_entry *tsc)
    }
 }
 
+extern int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space);
+
 #endif
diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c
index 586eefe50da..7f052437357 100644
--- a/src/gallium/drivers/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nv50/nv50_shader_state.c
@@ -129,9 +129,12 @@ nv50_program_update_context_state(struct nv50_context *nv50,
 {
    const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR;
 
-   if (prog && prog->uses_lmem) {
-      if (!nv50->state.tls_required)
+   if (prog && prog->tls_space) {
+      if (nv50->state.new_tls_space)
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
+      if (!nv50->state.tls_required || nv50->state.new_tls_space)
          BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo);
+      nv50->state.new_tls_space = FALSE;
       nv50->state.tls_required |= 1 << stage;
    } else {
       if (nv50->state.tls_required == (1 << stage))
-- 
2.30.2