From 1906d2b46b21a8e7496409e0639d8463ad86dcfe Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Wed, 27 Jun 2012 14:45:17 +0200 Subject: [PATCH] nv50: dynamically allocate space for shader local storage Fixes 21 piglit tests: spec/glsl-1.10/execution/variable-indexing/ fs-temp-array-mat4-index-col-row-wr vs-temp-array-mat4-index-col-row-wr vs-temp-array-mat4-index-row-wr spec/glsl-1.20/execution/variable-indexing/ fs-temp-array-mat3-index-col-row-rd fs-temp-array-mat3-index-row-rd fs-temp-array-mat4-col-row-wr fs-temp-array-mat4-index-col-row-rd fs-temp-array-mat4-index-col-row-wr fs-temp-array-mat4-index-row-rd fs-temp-array-mat4-index-row-wr vs-temp-array-mat3-index-col-row-rd vs-temp-array-mat3-index-col-row-wr vs-temp-array-mat3-index-row-rd vs-temp-array-mat3-index-row-wr vs-temp-array-mat4-col-row-wr vs-temp-array-mat4-index-col-row-rd vs-temp-array-mat4-index-col-row-wr vs-temp-array-mat4-index-col-wr vs-temp-array-mat4-index-row-rd vs-temp-array-mat4-index-row-wr vs-temp-array-mat4-index-wr ... 
and prevents a lot of GPU lockups --- src/gallium/drivers/nv50/nv50_context.h | 1 + src/gallium/drivers/nv50/nv50_program.c | 7 ++ src/gallium/drivers/nv50/nv50_program.h | 4 +- src/gallium/drivers/nv50/nv50_screen.c | 107 +++++++++++++++---- src/gallium/drivers/nv50/nv50_screen.h | 7 +- src/gallium/drivers/nv50/nv50_shader_state.c | 7 +- 6 files changed, 108 insertions(+), 25 deletions(-) diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h index 5da0473f0e6..818c3bb15fe 100644 --- a/src/gallium/drivers/nv50/nv50_context.h +++ b/src/gallium/drivers/nv50/nv50_context.h @@ -97,6 +97,7 @@ struct nv50_context { boolean flushed; boolean rasterizer_discard; uint8_t tls_required; + boolean new_tls_space; uint8_t num_vtxbufs; uint8_t num_vtxelts; uint8_t num_textures[3]; diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 9c5a080f6d6..72d14a6e3a0 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -350,6 +350,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset) prog->code_size = info->bin.codeSize; prog->fixups = info->bin.relocData; prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1); + prog->tls_space = info->bin.tlsSpace; if (prog->type == PIPE_SHADER_FRAGMENT) { if (info->prop.fp.writesDepth) { @@ -399,6 +400,12 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) } prog->code_base = prog->mem->start; + ret = nv50_tls_realloc(nv50->screen, prog->tls_space); + if (ret < 0) + return FALSE; + if (ret > 0) + nv50->state.new_tls_space = TRUE; + if (prog->fixups) nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0); diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index f56268b5439..24eaebfb473 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -28,8 +28,6 @@ struct 
nv50_context; #include "pipe/p_state.h" #include "pipe/p_shader_tokens.h" -#define NV50_CAP_MAX_PROGRAM_TEMPS 64 - struct nv50_varying { uint8_t id; /* tgsi index */ uint8_t hw; /* hw index, nv50 wants flat FP inputs last */ @@ -56,7 +54,6 @@ struct nv50_program { ubyte type; boolean translated; - boolean uses_lmem; uint32_t *code; unsigned code_size; @@ -64,6 +61,7 @@ struct nv50_program { uint32_t *immd; unsigned immd_size; unsigned parm_size; /* size limit of uniform buffer */ + uint32_t tls_space; /* required local memory per thread */ ubyte max_gpr; /* REG_ALLOC_TEMP */ ubyte max_out; /* REG_ALLOC_RESULT or FP_RESULT_COUNT */ diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c index 873946f520d..8c30c96aa73 100644 --- a/src/gallium/drivers/nv50/nv50_screen.c +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -28,11 +28,21 @@ #include "nv50_screen.h" #include "nouveau/nv_object.xml.h" +#include <errno.h> #ifndef NOUVEAU_GETPARAM_GRAPH_UNITS # define NOUVEAU_GETPARAM_GRAPH_UNITS 13 #endif +/* affected by LOCAL_WARPS_LOG_ALLOC / LOCAL_WARPS_NO_CLAMP */ +#define LOCAL_WARPS_ALLOC 32 +/* affected by STACK_WARPS_LOG_ALLOC / STACK_WARPS_NO_CLAMP */ +#define STACK_WARPS_ALLOC 32 + +#define THREADS_IN_WARP 32 + +#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float)) + static boolean nv50_screen_is_format_supported(struct pipe_screen *pscreen, enum pipe_format format, @@ -209,7 +219,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_MAX_PREDS: return 0; case PIPE_SHADER_CAP_MAX_TEMPS: - return NV50_CAP_MAX_PROGRAM_TEMPS; + return nv50_screen(pscreen)->max_tls_space / ONE_TEMP_SIZE; case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 1; case PIPE_SHADER_CAP_SUBROUTINES: @@ -311,7 +321,7 @@ nv50_screen_fence_update(struct pipe_screen *pscreen) } static void -nv50_screen_init_hwctx(struct nv50_screen *screen, unsigned tls_space) +nv50_screen_init_hwctx(struct nv50_screen *screen) { struct 
nouveau_pushbuf *push = screen->base.pushbuf; struct nv04_fifo *fifo; @@ -411,7 +421,7 @@ nv50_screen_init_hwctx(struct nv50_screen *screen, unsigned tls_space) BEGIN_NV04(push, NV50_3D(LOCAL_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->tls_bo->offset); PUSH_DATA (push, screen->tls_bo->offset); - PUSH_DATA (push, util_logbase2(tls_space / 8)); + PUSH_DATA (push, util_logbase2(screen->cur_tls_space / 8)); BEGIN_NV04(push, NV50_3D(STACK_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->stack_bo->offset); @@ -508,6 +518,60 @@ nv50_screen_init_hwctx(struct nv50_screen *screen, unsigned tls_space) PUSH_KICK (push); } +static int nv50_tls_alloc(struct nv50_screen *screen, unsigned tls_space, + uint64_t *tls_size) +{ + struct nouveau_device *dev = screen->base.device; + int ret; + + screen->cur_tls_space = util_next_power_of_two(tls_space / ONE_TEMP_SIZE) * + ONE_TEMP_SIZE; + if (nouveau_mesa_debug) + debug_printf("allocating space for %u temps\n", + util_next_power_of_two(tls_space / ONE_TEMP_SIZE)); + *tls_size = screen->cur_tls_space * util_next_power_of_two(screen->TPs) * + screen->MPsInTP * LOCAL_WARPS_ALLOC * THREADS_IN_WARP; + + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, + *tls_size, NULL, &screen->tls_bo); + if (ret) { + NOUVEAU_ERR("Failed to allocate local bo: %d\n", ret); + return ret; + } + + return 0; +} + +int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space) +{ + struct nouveau_pushbuf *push = screen->base.pushbuf; + int ret; + uint64_t tls_size; + + if (tls_space < screen->cur_tls_space) + return 0; + if (tls_space > screen->max_tls_space) { + /* fixable by limiting number of warps (LOCAL_WARPS_LOG_ALLOC / + * LOCAL_WARPS_NO_CLAMP) */ + NOUVEAU_ERR("Unsupported number of temporaries (%u > %u). 
Fixable if someone cares.\n", + (unsigned)(tls_space / ONE_TEMP_SIZE), + (unsigned)(screen->max_tls_space / ONE_TEMP_SIZE)); + return -ENOMEM; + } + + nouveau_bo_ref(NULL, &screen->tls_bo); + ret = nv50_tls_alloc(screen, tls_space, &tls_size); + if (ret) + return ret; + + BEGIN_NV04(push, NV50_3D(LOCAL_ADDRESS_HIGH), 3); + PUSH_DATAh(push, screen->tls_bo->offset); + PUSH_DATA (push, screen->tls_bo->offset); + PUSH_DATA (push, util_logbase2(screen->cur_tls_space / 8)); + + return 1; +} + struct pipe_screen * nv50_screen_create(struct nouveau_device *dev) { @@ -516,7 +580,7 @@ nv50_screen_create(struct nouveau_device *dev) struct nouveau_object *chan; uint64_t value; uint32_t tesla_class; - unsigned stack_size, max_warps, tls_space; + unsigned stack_size; int ret; screen = CALLOC_STRUCT(nv50_screen); @@ -637,10 +701,11 @@ nv50_screen_create(struct nouveau_device *dev) nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value); - max_warps = util_bitcount(value & 0xffff); - max_warps *= util_bitcount((value >> 24) & 0xf) * 32; + screen->TPs = util_bitcount(value & 0xffff); + screen->MPsInTP = util_bitcount((value >> 24) & 0xf); - stack_size = max_warps * 64 * 8; + stack_size = util_next_power_of_two(screen->TPs) * screen->MPsInTP * + STACK_WARPS_ALLOC * 64 * 8; ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, stack_size, NULL, &screen->stack_bo); @@ -649,20 +714,24 @@ nv50_screen_create(struct nouveau_device *dev) goto fail; } - tls_space = NV50_CAP_MAX_PROGRAM_TEMPS * 16; + uint64_t size_of_one_temp = util_next_power_of_two(screen->TPs) * + screen->MPsInTP * LOCAL_WARPS_ALLOC * THREADS_IN_WARP * + ONE_TEMP_SIZE; + screen->max_tls_space = dev->vram_size / size_of_one_temp * ONE_TEMP_SIZE; + screen->max_tls_space /= 2; /* half of vram */ - screen->tls_size = tls_space * max_warps * 32; + /* hw can address max 64 KiB */ + screen->max_tls_space = MIN2(screen->max_tls_space, 64 << 10); - if (nouveau_mesa_debug) - debug_printf("max_warps = %i, tls_size = %"PRIu64" 
KiB\n", - max_warps, screen->tls_size >> 10); - - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, screen->tls_size, NULL, - &screen->tls_bo); - if (ret) { - NOUVEAU_ERR("Failed to allocate local bo: %d\n", ret); + uint64_t tls_size; + unsigned tls_space = 4/*temps*/ * ONE_TEMP_SIZE; + ret = nv50_tls_alloc(screen, tls_space, &tls_size); + if (ret) goto fail; - } + + if (nouveau_mesa_debug) + debug_printf("TPs = %u, MPsInTP = %u, VRAM = %"PRIu64" MiB, tls_size = %"PRIu64" KiB\n", + screen->TPs, screen->MPsInTP, dev->vram_size >> 20, tls_size >> 10); ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, 4 << 16, NULL, &screen->uniforms); @@ -684,7 +753,7 @@ nv50_screen_create(struct nouveau_device *dev) if (!nv50_blitctx_create(screen)) goto fail; - nv50_screen_init_hwctx(screen, tls_space); + nv50_screen_init_hwctx(screen); nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE); diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h index 4292f7fc9a6..3ecf20c1e03 100644 --- a/src/gallium/drivers/nv50/nv50_screen.h +++ b/src/gallium/drivers/nv50/nv50_screen.h @@ -34,7 +34,10 @@ struct nv50_screen { struct nouveau_bo *stack_bo; struct nouveau_bo *tls_bo; - uint64_t tls_size; + unsigned TPs; + unsigned MPsInTP; + unsigned max_tls_space; + unsigned cur_tls_space; struct nouveau_heap *vp_code_heap; struct nouveau_heap *gp_code_heap; @@ -143,4 +146,6 @@ nv50_screen_tsc_free(struct nv50_screen *screen, struct nv50_tsc_entry *tsc) } } +extern int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space); + #endif diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c index 586eefe50da..7f052437357 100644 --- a/src/gallium/drivers/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nv50/nv50_shader_state.c @@ -129,9 +129,12 @@ nv50_program_update_context_state(struct nv50_context *nv50, { const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR; - if (prog && 
prog->uses_lmem) { - if (!nv50->state.tls_required) + if (prog && prog->tls_space) { + if (nv50->state.new_tls_space) + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS); + if (!nv50->state.tls_required || nv50->state.new_tls_space) BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo); + nv50->state.new_tls_space = FALSE; nv50->state.tls_required |= 1 << stage; } else { if (nv50->state.tls_required == (1 << stage)) -- 2.30.2