From: Christoph Bumiller Date: Sat, 23 Feb 2013 18:40:23 +0000 (+0100) Subject: nvc0: implement compute support for nve4 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=e066f2f62f6043d43385bcdce4e7fa07ffa3ecbe;p=mesa.git nvc0: implement compute support for nve4 --- diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h index 1de3fa65f5d..d5bc8171bb5 100644 --- a/src/gallium/drivers/nouveau/nouveau_screen.h +++ b/src/gallium/drivers/nouveau/nouveau_screen.h @@ -5,6 +5,7 @@ #include "util/u_memory.h" typedef uint32_t u32; +typedef uint16_t u16; extern int nouveau_mesa_debug; diff --git a/src/gallium/drivers/nouveau/nv_object.xml.h b/src/gallium/drivers/nouveau/nv_object.xml.h index 66ba61b4622..2fd52acf36a 100644 --- a/src/gallium/drivers/nouveau/nv_object.xml.h +++ b/src/gallium/drivers/nouveau/nv_object.xml.h @@ -196,6 +196,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NVC0_COMPUTE_CLASS 0x000090c0 #define NVC8_COMPUTE_CLASS 0x000092c0 #define NVE4_COMPUTE_CLASS 0x0000a0c0 +#define NVF0_COMPUTE_CLASS 0x0000a1c0 #define NV84_CRYPT_CLASS 0x000074c1 #define BLOB_NVC0_PCOPY1_CLASS 0x000090b8 #define BLOB_NVC0_PCOPY0_CLASS 0x000090b5 diff --git a/src/gallium/drivers/nv50/nv50_defs.xml.h b/src/gallium/drivers/nv50/nv50_defs.xml.h index 27046e9e564..2e42843fa56 100644 --- a/src/gallium/drivers/nv50/nv50_defs.xml.h +++ b/src/gallium/drivers/nv50/nv50_defs.xml.h @@ -1,5 +1,5 @@ -#ifndef RNNDB_NV50_DEFS_XML -#define RNNDB_NV50_DEFS_XML +#ifndef NV50_DEFS_XML +#define NV50_DEFS_XML /* Autogenerated file, DO NOT EDIT manually! @@ -8,11 +8,11 @@ http://0x04.net/cgit/index.cgi/rules-ng-ng git clone git://0x04.net/rules-ng-ng The rules-ng-ng source files this header was generated from are: -- rnndb/nv50_defs.xml ( 5468 bytes, from 2011-07-09 13:43:58) -- ./rnndb/copyright.xml ( 6452 bytes, from 2011-07-09 13:43:58) -- ./rnndb/nvchipsets.xml ( 3617 bytes, from 2011-07-09 13:43:58) +- rnndb/nv50_defs.xml ( 7783 bytes, from 2013-02-14 13:56:25) +- ./rnndb/copyright.xml ( 6452 bytes, from 2011-08-11 18:25:12) +- ./rnndb/nvchipsets.xml ( 3704 bytes, from 2012-08-18 12:48:55) -Copyright (C) 2006-2011 by the following authors: +Copyright (C) 2006-2013 by the following authors: - Artur Huillet (ahuillet) - Ben Skeggs (darktama, darktama_) - B. R. (koala_br) @@ -71,6 +71,13 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define NV50_VSTATUS_IDLE 0x00000000 +#define NV50_VSTATUS_BUSY 0x00000001 +#define NV50_VSTATUS_UNK2 0x00000002 +#define NV50_VSTATUS_WAITING 0x00000003 +#define NV50_VSTATUS_BLOCKED 0x00000005 +#define NV50_VSTATUS_FAULTED 0x00000006 +#define NV50_VSTATUS_PAUSED 0x00000007 #define NV50_SURFACE_FORMAT_BITMAP 0x0000001c #define NV50_SURFACE_FORMAT_UNK1D 0x0000001d #define NV50_SURFACE_FORMAT_RGBA32_FLOAT 0x000000c0 @@ -143,6 +150,45 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define NV50_ZETA_FORMAT_Z24_X8_S8_C8_X16_UNORM 0x0000001d #define NV50_ZETA_FORMAT_Z32_X8_C8_X16_FLOAT 0x0000001e #define NV50_ZETA_FORMAT_Z32_S8_C8_X16_FLOAT 0x0000001f +#define NVE4_IMAGE_FORMAT_RGBA32_FLOAT 0x00000002 +#define NVE4_IMAGE_FORMAT_RGBA32_SINT 0x00000003 +#define NVE4_IMAGE_FORMAT_RGBA32_UINT 0x00000004 +#define NVE4_IMAGE_FORMAT_RGBA16_UNORM 0x00000008 +#define NVE4_IMAGE_FORMAT_RGBA16_SNORM 0x00000009 +#define NVE4_IMAGE_FORMAT_RGBA16_SINT 0x0000000a +#define NVE4_IMAGE_FORMAT_RGBA16_UINT 0x0000000b +#define NVE4_IMAGE_FORMAT_RGBA16_FLOAT 0x0000000c +#define NVE4_IMAGE_FORMAT_RG32_FLOAT 0x0000000d +#define NVE4_IMAGE_FORMAT_RG32_SINT 0x0000000e +#define NVE4_IMAGE_FORMAT_RG32_UINT 0x0000000f +#define NVE4_IMAGE_FORMAT_RGB10_A2_UNORM 0x00000013 +#define NVE4_IMAGE_FORMAT_RGB10_A2_UINT 0x00000015 +#define NVE4_IMAGE_FORMAT_RGBA8_UNORM 0x00000018 +#define NVE4_IMAGE_FORMAT_RGBA8_SNORM 0x0000001a +#define NVE4_IMAGE_FORMAT_RGBA8_SINT 0x0000001b +#define NVE4_IMAGE_FORMAT_RGBA8_UINT 0x0000001c +#define NVE4_IMAGE_FORMAT_RG16_UNORM 0x0000001d +#define NVE4_IMAGE_FORMAT_RG16_SNORM 0x0000001e +#define NVE4_IMAGE_FORMAT_RG16_SINT 0x0000001f +#define NVE4_IMAGE_FORMAT_RG16_UINT 0x00000020 +#define NVE4_IMAGE_FORMAT_RG16_FLOAT 0x00000021 +#define NVE4_IMAGE_FORMAT_R11G11B10_FLOAT 0x00000024 +#define NVE4_IMAGE_FORMAT_R32_SINT 0x00000027 +#define NVE4_IMAGE_FORMAT_R32_UINT 0x00000028 +#define NVE4_IMAGE_FORMAT_R32_FLOAT 0x00000029 +#define NVE4_IMAGE_FORMAT_RG8_UNORM 0x0000002e +#define NVE4_IMAGE_FORMAT_RG8_SNORM 0x0000002f +#define NVE4_IMAGE_FORMAT_RG8_SINT 0x00000030 +#define NVE4_IMAGE_FORMAT_RG8_UINT 0x00000031 +#define NVE4_IMAGE_FORMAT_R16_UNORM 0x00000032 +#define NVE4_IMAGE_FORMAT_R16_SNORM 0x00000033 +#define NVE4_IMAGE_FORMAT_R16_SINT 0x00000034 +#define NVE4_IMAGE_FORMAT_R16_UINT 0x00000035 +#define NVE4_IMAGE_FORMAT_R16_FLOAT 0x00000036 +#define NVE4_IMAGE_FORMAT_R8_UNORM 0x00000037 +#define NVE4_IMAGE_FORMAT_R8_SNORM 0x00000038 +#define NVE4_IMAGE_FORMAT_R8_SINT 0x00000039 +#define NVE4_IMAGE_FORMAT_R8_UINT 0x0000003a #define NV50_QUERY__SIZE 0x00000010 #define NV50_QUERY_COUNTER 0x00000000 @@ -151,4 +197,4 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define NV50_QUERY_TIME 0x00000008 -#endif /* RNNDB_NV50_DEFS_XML */ +#endif /* NV50_DEFS_XML */ diff --git a/src/gallium/drivers/nvc0/Makefile.sources b/src/gallium/drivers/nvc0/Makefile.sources index 33b90f290fb..db8d12347b0 100644 --- a/src/gallium/drivers/nvc0/Makefile.sources +++ b/src/gallium/drivers/nvc0/Makefile.sources @@ -14,6 +14,7 @@ C_SOURCES := \ nvc0_program.c \ nvc0_shader_state.c \ nvc0_query.c \ + nve4_compute.c \ nvc0_video.c \ nvc0_video_bsp.c \ nvc0_video_vp.c \ diff --git a/src/gallium/drivers/nvc0/nvc0_context.c b/src/gallium/drivers/nvc0/nvc0_context.c index 75bd1551b8f..dc0c4b922db 100644 --- a/src/gallium/drivers/nvc0/nvc0_context.c +++ b/src/gallium/drivers/nvc0/nvc0_context.c @@ -63,6 +63,7 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0) nouveau_bufctx_del(&nvc0->bufctx_3d); nouveau_bufctx_del(&nvc0->bufctx); + nouveau_bufctx_del(&nvc0->bufctx_cp); util_unreference_framebuffer_state(&nvc0->framebuffer); @@ -71,7 +72,7 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0) pipe_resource_reference(&nvc0->idxbuf.buffer, NULL); - for (s = 0; s < 5; ++s) { + for (s = 0; s < 6; ++s) { for (i = 0; i < nvc0->num_textures[s]; ++i) pipe_sampler_view_reference(&nvc0->textures[s][i], NULL); @@ -80,8 +81,21 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0) pipe_resource_reference(&nvc0->constbuf[s][i].u.buf, NULL); } + for (s = 0; s < 2; ++s) { + for (i = 0; i < NVC0_MAX_SURFACE_SLOTS; ++i) + pipe_surface_reference(&nvc0->surfaces[s][i], NULL); + } + for (i = 0; i < nvc0->num_tfbbufs; ++i) pipe_so_target_reference(&nvc0->tfbbuf[i], NULL); + + for (i = 0; i < nvc0->global_residents.size / sizeof(struct pipe_resource *); + ++i) { + struct pipe_resource **res = util_dynarray_element( + &nvc0->global_residents, struct pipe_resource *, i); + pipe_resource_reference(res, NULL); + } + util_dynarray_fini(&nvc0->global_residents); } static void @@ -219,10 +233,13 @@ nvc0_create(struct pipe_screen *pscreen, void *priv) nvc0->base.pushbuf = screen->base.pushbuf; nvc0->base.client = screen->base.client; - ret = nouveau_bufctx_new(screen->base.client, NVC0_BIND_COUNT, - &nvc0->bufctx_3d); + ret = nouveau_bufctx_new(screen->base.client, 2, &nvc0->bufctx); + if (!ret) + ret = nouveau_bufctx_new(screen->base.client, NVC0_BIND_3D_COUNT, + &nvc0->bufctx_3d); if (!ret) - nouveau_bufctx_new(screen->base.client, 2, &nvc0->bufctx); + ret = nouveau_bufctx_new(screen->base.client, NVC0_BIND_CP_COUNT, + &nvc0->bufctx_cp); if (ret) goto out_err; @@ -236,6 +253,8 @@ nvc0_create(struct pipe_screen *pscreen, void *priv) pipe->draw_vbo = nvc0_draw_vbo; pipe->clear = nvc0_clear; + if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) + pipe->launch_grid = nve4_launch_grid; pipe->flush = nvc0_flush; pipe->texture_barrier = nvc0_texture_barrier; @@ -274,23 +293,39 @@ nvc0_create(struct pipe_screen *pscreen, void *priv) BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->text); BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->uniform_bo); BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->txc); + if (screen->compute) { + BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->text); + BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->txc); + BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->parm); + } + + flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR; + BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->poly_cache); + if (screen->compute) + BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->tls); flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR; 
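(Editor's aside, not part of the patch: the nvc0_context.c hunk above wires pipe->launch_grid to nve4_launch_grid only for NVE4_3D_CLASS and newer. For readers unfamiliar with the Gallium compute hooks of this era, here is a minimal, hypothetical caller sketch; the kernel tokens, input buffer and launch sizes are made-up placeholders, only the entry points and the launch_grid signature come from this patch.)

   /* hypothetical caller sketch, not part of the patch: pipe is assumed to be
    * a context created from an NVE4-class screen; error handling omitted */
   #include <string.h>
   #include "pipe/p_context.h"
   #include "pipe/p_state.h"

   static void
   run_kernel(struct pipe_context *pipe, const void *tokens,
              const void *input, unsigned input_size)
   {
      struct pipe_compute_state templ;
      void *cs;
      const unsigned block[3] = { 16, 16, 1 };   /* threads per block (arbitrary) */
      const unsigned grid[3]  = { 64, 64, 1 };   /* blocks per grid (arbitrary) */

      memset(&templ, 0, sizeof(templ));
      templ.prog = tokens;                /* TGSI, per PIPE_SHADER_CAP_PREFERRED_IR later in this patch */
      templ.req_input_mem = input_size;   /* becomes prog->parm_size in nvc0_cp_state_create() */

      cs = pipe->create_compute_state(pipe, &templ);
      pipe->bind_compute_state(pipe, cs);
      /* launch_grid(ctx, block_layout, grid_layout, pc, input); pc 0 = kernel entry */
      pipe->launch_grid(pipe, block, grid, 0, input);
      pipe->delete_compute_state(pipe, cs);
   }
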
BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->fence.bo); BCTX_REFN_bo(nvc0->bufctx, FENCE, flags, screen->fence.bo); + if (screen->compute) + BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->fence.bo); nvc0->base.scratch.bo_size = 2 << 20; memset(nvc0->tex_handles, ~0, sizeof(nvc0->tex_handles)); + util_dynarray_init(&nvc0->global_residents); + return pipe; out_err: if (nvc0) { if (nvc0->bufctx_3d) nouveau_bufctx_del(&nvc0->bufctx_3d); + if (nvc0->bufctx_cp) + nouveau_bufctx_del(&nvc0->bufctx_cp); if (nvc0->bufctx) nouveau_bufctx_del(&nvc0->bufctx); if (nvc0->blit) diff --git a/src/gallium/drivers/nvc0/nvc0_context.h b/src/gallium/drivers/nvc0/nvc0_context.h index f5b0b6b849e..d9aa3788cb0 100644 --- a/src/gallium/drivers/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nvc0/nvc0_context.h @@ -55,7 +55,16 @@ #define NVC0_NEW_SAMPLERS (1 << 20) #define NVC0_NEW_TFB_TARGETS (1 << 21) #define NVC0_NEW_IDXBUF (1 << 22) +#define NVC0_NEW_SURFACES (1 << 23) +#define NVC0_NEW_CP_PROGRAM (1 << 0) +#define NVC0_NEW_CP_SURFACES (1 << 1) +#define NVC0_NEW_CP_TEXTURES (1 << 2) +#define NVC0_NEW_CP_SAMPLERS (1 << 3) +#define NVC0_NEW_CP_CONSTBUF (1 << 4) +#define NVC0_NEW_CP_GLOBALS (1 << 5) + +/* 3d bufctx (during draw_vbo, blit_3d) */ #define NVC0_BIND_FB 0 #define NVC0_BIND_VTX 1 #define NVC0_BIND_VTX_TMP 2 @@ -63,10 +72,21 @@ #define NVC0_BIND_TEX(s, i) ( 4 + 32 * (s) + (i)) #define NVC0_BIND_CB(s, i) (164 + 16 * (s) + (i)) #define NVC0_BIND_TFB 244 -#define NVC0_BIND_SCREEN 245 -#define NVC0_BIND_TLS 246 -#define NVC0_BIND_COUNT 247 - +#define NVC0_BIND_SUF 245 +#define NVC0_BIND_SCREEN 246 +#define NVC0_BIND_TLS 247 +#define NVC0_BIND_3D_COUNT 248 + +/* compute bufctx (during launch_grid) */ +#define NVC0_BIND_CP_CB(i) ( 0 + (i)) +#define NVC0_BIND_CP_TEX(i) ( 16 + (i)) +#define NVC0_BIND_CP_SUF 48 +#define NVC0_BIND_CP_GLOBAL 49 +#define NVC0_BIND_CP_DESC 50 +#define NVC0_BIND_CP_SCREEN 51 +#define NVC0_BIND_CP_COUNT 52 + +/* bufctx for other operations */ #define NVC0_BIND_2D 0 #define NVC0_BIND_M2MF 0 #define NVC0_BIND_FENCE 1 @@ -81,6 +101,7 @@ struct nvc0_context { struct nouveau_bufctx *bufctx_3d; struct nouveau_bufctx *bufctx; + struct nouveau_bufctx *bufctx_cp; struct nvc0_screen *screen; @@ -90,6 +111,7 @@ struct nvc0_context { uint32_t nblocksx, uint32_t nblocksy); uint32_t dirty; + uint32_t dirty_cp; /* dirty flags for compute state */ struct { boolean flushed; @@ -105,8 +127,8 @@ struct nvc0_context { uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */ uint8_t num_vtxbufs; uint8_t num_vtxelts; - uint8_t num_textures[5]; - uint8_t num_samplers[5]; + uint8_t num_textures[6]; + uint8_t num_samplers[6]; uint8_t tls_required; /* bitmask of shader types using l[] */ uint8_t c14_bound; /* whether immediate array constbuf is bound */ uint8_t clip_enable; @@ -125,9 +147,10 @@ struct nvc0_context { struct nvc0_program *tevlprog; struct nvc0_program *gmtyprog; struct nvc0_program *fragprog; + struct nvc0_program *compprog; - struct nvc0_constbuf constbuf[5][NVC0_MAX_PIPE_CONSTBUFS]; - uint16_t constbuf_dirty[5]; + struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS]; + uint16_t constbuf_dirty[6]; struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; unsigned num_vtxbufs; @@ -139,14 +162,14 @@ struct nvc0_context { uint32_t instance_off; /* current base vertex for instanced arrays */ uint32_t instance_max; /* last instance for current draw call */ - struct pipe_sampler_view *textures[5][PIPE_MAX_SAMPLERS]; - unsigned num_textures[5]; - uint32_t textures_dirty[5]; 
- struct nv50_tsc_entry *samplers[5][PIPE_MAX_SAMPLERS]; - unsigned num_samplers[5]; - uint16_t samplers_dirty[5]; + struct pipe_sampler_view *textures[6][PIPE_MAX_SAMPLERS]; + unsigned num_textures[6]; + uint32_t textures_dirty[6]; + struct nv50_tsc_entry *samplers[6][PIPE_MAX_SAMPLERS]; + unsigned num_samplers[6]; + uint16_t samplers_dirty[6]; - uint32_t tex_handles[5][PIPE_MAX_SAMPLERS]; /* for nve4 */ + uint32_t tex_handles[6][PIPE_MAX_SAMPLERS]; /* for nve4 */ struct pipe_framebuffer_state framebuffer; struct pipe_blend_color blend_colour; @@ -169,6 +192,12 @@ struct nvc0_context { struct nvc0_blitctx *blit; + struct pipe_surface *surfaces[2][NVC0_MAX_SURFACE_SLOTS]; + uint16_t surfaces_dirty[2]; + uint16_t surfaces_valid[2]; + + struct util_dynarray global_residents; + #ifdef NVC0_WITH_DRAW_MODULE struct draw_context *draw; #endif @@ -211,6 +240,8 @@ boolean nvc0_program_translate(struct nvc0_program *, uint16_t chipset); boolean nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *); void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *); void nvc0_program_library_upload(struct nvc0_context *); +uint32_t nvc0_program_symbol_offset(const struct nvc0_program *, + uint32_t label); /* nvc0_query.c */ void nvc0_init_query_functions(struct nvc0_context *); @@ -236,6 +267,8 @@ void nvc0_tfb_validate(struct nvc0_context *); extern void nvc0_init_state_functions(struct nvc0_context *); /* nvc0_state_validate.c */ +void nvc0_validate_global_residents(struct nvc0_context *, + struct nouveau_bufctx *, int bin); extern boolean nvc0_state_validate(struct nvc0_context *, uint32_t state_mask, unsigned space_words); @@ -246,9 +279,13 @@ extern void nvc0_clear(struct pipe_context *, unsigned buffers, extern void nvc0_init_surface_functions(struct nvc0_context *); /* nvc0_tex.c */ +boolean nve4_validate_tsc(struct nvc0_context *nvc0, int s); void nvc0_validate_textures(struct nvc0_context *); void nvc0_validate_samplers(struct nvc0_context *); void nve4_set_tex_handles(struct nvc0_context *); +void nvc0_validate_surfaces(struct nvc0_context *); +void nve4_set_surface_info(struct nouveau_pushbuf *, struct pipe_surface *, + struct nvc0_screen *); struct pipe_sampler_view * nvc0_create_texture_view(struct pipe_context *, @@ -315,4 +352,8 @@ nvc0_screen_get_video_param(struct pipe_screen *pscreen, /* nvc0_push.c */ void nvc0_push_vbo(struct nvc0_context *, const struct pipe_draw_info *); +/* nve4_compute.c */ +void nve4_launch_grid(struct pipe_context *, + const uint *, const uint *, uint32_t, const void *); + #endif diff --git a/src/gallium/drivers/nvc0/nvc0_program.c b/src/gallium/drivers/nvc0/nvc0_program.c index e4ac8ba8e2c..592d338f446 100644 --- a/src/gallium/drivers/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nvc0/nvc0_program.c @@ -25,6 +25,7 @@ #include "nvc0_context.h" #include "nv50/codegen/nv50_ir_driver.h" +#include "nve4_compute.h" /* If only they told use the actual semantic instead of just GENERIC ... 
*/ static void @@ -533,10 +534,11 @@ nvc0_program_dump(struct nvc0_program *prog) { unsigned pos; - for (pos = 0; pos < sizeof(prog->hdr) / sizeof(prog->hdr[0]); ++pos) - debug_printf("HDR[%02lx] = 0x%08x\n", - pos * sizeof(prog->hdr[0]), prog->hdr[pos]); - + if (prog->type != PIPE_SHADER_COMPUTE) { + for (pos = 0; pos < sizeof(prog->hdr) / sizeof(prog->hdr[0]); ++pos) + debug_printf("HDR[%02lx] = 0x%08x\n", + pos * sizeof(prog->hdr[0]), prog->hdr[pos]); + } debug_printf("shader binary code (0x%x bytes):", prog->code_size); for (pos = 0; pos < prog->code_size / 4; ++pos) { if ((pos % 8) == 0) @@ -569,11 +571,11 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) if (prog->type == PIPE_SHADER_COMPUTE) { if (chipset >= NVISA_GK104_CHIPSET) { info->io.resInfoCBSlot = 0; - info->io.texBindBase = 0; /* TODO */ - info->io.suInfoBase = 0; /* TODO */ + info->io.texBindBase = NVE4_CP_INPUT_TEX(0); + info->io.suInfoBase = NVE4_CP_INPUT_SUF(0); } info->io.msInfoCBSlot = 0; - info->io.msInfoBase = 0; /* TODO */ + info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS; } else { if (chipset >= NVISA_GK104_CHIPSET) { info->io.resInfoCBSlot = 15; @@ -598,14 +600,16 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) NOUVEAU_ERR("shader translation failed: %i\n", ret); goto out; } - FREE(info->bin.syms); + if (prog->type != PIPE_SHADER_COMPUTE) + FREE(info->bin.syms); prog->code = info->bin.code; prog->code_size = info->bin.codeSize; prog->immd_data = info->immd.buf; prog->immd_size = info->immd.bufSize; prog->relocs = info->bin.relocData; - prog->max_gpr = MAX2(4, (info->bin.maxGPR + 1)); + prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1)); + prog->num_barriers = info->numBarriers; prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS; @@ -633,6 +637,10 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) case PIPE_SHADER_FRAGMENT: ret = nvc0_fp_gen_header(prog, info); break; + case PIPE_SHADER_COMPUTE: + prog->cp.syms = info->bin.syms; + prog->cp.num_syms = info->bin.numSyms; + break; default: ret = -1; NOUVEAU_ERR("unknown program type: %u\n", prog->type); @@ -672,8 +680,9 @@ boolean nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) { struct nvc0_screen *screen = nvc0->screen; + const boolean is_cp = prog->type == PIPE_SHADER_COMPUTE; int ret; - uint32_t size = prog->code_size + NVC0_SHADER_HEADER_SIZE; + uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE); uint32_t lib_pos = screen->lib_code->start; uint32_t code_pos; @@ -689,7 +698,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) * latency information is expected only at certain positions. */ if (screen->base.class_3d >= NVE4_3D_CLASS) - size = size + 0x70; + size = size + (is_cp ? 
0x40 : 0x70); size = align(size, 0x40); ret = nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem); @@ -714,18 +723,27 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) assert((prog->immd_size == 0) || (prog->immd_base + prog->immd_size <= prog->mem->start + prog->mem->size)); - if (screen->base.class_3d >= NVE4_3D_CLASS) { - switch (prog->mem->start & 0xff) { - case 0x40: prog->code_base += 0x70; break; - case 0x80: prog->code_base += 0x30; break; - case 0xc0: prog->code_base += 0x70; break; - default: - prog->code_base += 0x30; - assert((prog->mem->start & 0xff) == 0x00); - break; + if (!is_cp) { + if (screen->base.class_3d >= NVE4_3D_CLASS) { + switch (prog->mem->start & 0xff) { + case 0x40: prog->code_base += 0x70; break; + case 0x80: prog->code_base += 0x30; break; + case 0xc0: prog->code_base += 0x70; break; + default: + prog->code_base += 0x30; + assert((prog->mem->start & 0xff) == 0x00); + break; + } + } + code_pos = prog->code_base + NVC0_SHADER_HEADER_SIZE; + } else { + if (screen->base.class_3d >= NVE4_3D_CLASS) { + if (prog->mem->start & 0x40) + prog->code_base += 0x40; + assert((prog->code_base & 0x7f) == 0x00); } + code_pos = prog->code_base; } - code_pos = prog->code_base + NVC0_SHADER_HEADER_SIZE; if (prog->relocs) nv50_ir_relocate_code(prog->relocs, prog->code, code_pos, lib_pos, 0); @@ -735,10 +753,10 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) nvc0_program_dump(prog); #endif - nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base, - NOUVEAU_BO_VRAM, NVC0_SHADER_HEADER_SIZE, prog->hdr); - nvc0->base.push_data(&nvc0->base, screen->text, - prog->code_base + NVC0_SHADER_HEADER_SIZE, + if (!is_cp) + nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base, + NOUVEAU_BO_VRAM, NVC0_SHADER_HEADER_SIZE, prog->hdr); + nvc0->base.push_data(&nvc0->base, screen->text, code_pos, NOUVEAU_BO_VRAM, prog->code_size, prog->code); if (prog->immd_size) nvc0->base.push_data(&nvc0->base, @@ -790,6 +808,8 @@ nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog) FREE(prog->code); FREE(prog->immd_data); FREE(prog->relocs); + if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms) + FREE(prog->cp.syms); if (prog->tfb) { if (nvc0->state.tfb == prog->tfb) nvc0->state.tfb = NULL; @@ -801,3 +821,18 @@ nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog) prog->pipe = pipe; prog->type = type; } + +uint32_t +nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label) +{ + const struct nv50_ir_prog_symbol *syms = + (const struct nv50_ir_prog_symbol *)prog->cp.syms; + unsigned base = 0; + unsigned i; + if (prog->type != PIPE_SHADER_COMPUTE) + base = NVC0_SHADER_HEADER_SIZE; + for (i = 0; i < prog->cp.num_syms; ++i) + if (syms[i].label == label) + return prog->code_base + base + syms[i].offset; + return ~0; +} diff --git a/src/gallium/drivers/nvc0/nvc0_program.h b/src/gallium/drivers/nvc0/nvc0_program.h index f6d1121c6dd..9c184d1f1d5 100644 --- a/src/gallium/drivers/nvc0/nvc0_program.h +++ b/src/gallium/drivers/nvc0/nvc0_program.h @@ -22,7 +22,7 @@ struct nvc0_program { ubyte type; boolean translated; boolean need_tls; - uint8_t max_gpr; + uint8_t num_gprs; uint32_t *code; uint32_t *immd_data; @@ -50,6 +50,13 @@ struct nvc0_program { uint32_t tess_mode; /* ~0 if defined by the other stage */ uint32_t input_patch_size; } tp; + struct { + uint32_t lmem_size; /* local memory (TGSI PRIVATE resource) size */ + uint32_t smem_size; /* shared memory (TGSI LOCAL 
resource) size */ + void *syms; + unsigned num_syms; + } cp; + uint8_t num_barriers; void *relocs; diff --git a/src/gallium/drivers/nvc0/nvc0_screen.c b/src/gallium/drivers/nvc0/nvc0_screen.c index 077f89efef9..7d034797eef 100644 --- a/src/gallium/drivers/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nvc0/nvc0_screen.c @@ -88,12 +88,12 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) switch (param) { case PIPE_CAP_MAX_COMBINED_SAMPLERS: - return 16 * PIPE_SHADER_TYPES; /* NOTE: should not count COMPUTE */ + return 16 * 5; case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: return 15; case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: - return 12; + return (class_3d >= NVE4_3D_CLASS) ? 13 : 12; case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: return 2048; case PIPE_CAP_MIN_TEXEL_OFFSET: @@ -176,6 +176,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: case PIPE_CAP_TEXTURE_MULTISAMPLE: return 0; + case PIPE_CAP_COMPUTE: + return (class_3d >= NVE4_3D_CLASS) ? 1 : 0; default: NOUVEAU_ERR("unknown PIPE_CAP %d\n", param); return 0; @@ -186,6 +188,8 @@ static int nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, enum pipe_shader_cap param) { + const uint16_t class_3d = nouveau_screen(pscreen)->class_3d; + switch (shader) { case PIPE_SHADER_VERTEX: /* @@ -195,11 +199,17 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_GEOMETRY: case PIPE_SHADER_FRAGMENT: break; + case PIPE_SHADER_COMPUTE: + if (class_3d < NVE4_3D_CLASS) + return 0; + break; default: return 0; } switch (param) { + case PIPE_SHADER_CAP_PREFERRED_IR: + return PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: @@ -216,6 +226,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_MAX_CONSTS: return 65536 / 16; case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: + if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS) + return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE; return NVC0_MAX_PIPE_CONSTBUFS; case PIPE_SHADER_CAP_MAX_ADDRS: return 1; @@ -234,7 +246,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: return 0; case PIPE_SHADER_CAP_SUBROUTINES: - return 1; /* but inlining everything, we need function declarations */ + return 1; case PIPE_SHADER_CAP_INTEGERS: return 1; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: @@ -270,6 +282,47 @@ nvc0_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param) } } +static int +nvc0_screen_get_compute_param(struct pipe_screen *pscreen, + enum pipe_compute_cap param, void *data) +{ + uint64_t *data64 = (uint64_t *)data; + const uint16_t obj_class = nvc0_screen(pscreen)->compute->oclass; + + switch (param) { + case PIPE_COMPUTE_CAP_GRID_DIMENSION: + data64[0] = 3; + return 8; + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: + data64[0] = (obj_class >= NVE4_COMPUTE_CLASS) ? 
0x7fffffff : 65535; + data64[1] = 65535; + data64[2] = 65535; + return 24; + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: + data64[0] = 1024; + data64[1] = 1024; + data64[2] = 64; + return 24; + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: + data64[0] = 1024; + return 8; + case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: /* g[] */ + data64[0] = (uint64_t)1 << 40; + return 8; + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: /* s[] */ + data64[0] = 48 << 10; + return 8; + case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: /* l[] */ + data64[0] = 512 << 10; + return 8; + case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */ + data64[0] = 4096; + return 8; + default: + return 0; + } +} + static void nvc0_screen_destroy(struct pipe_screen *pscreen) { @@ -291,6 +344,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen) nouveau_bo_ref(NULL, &screen->txc); nouveau_bo_ref(NULL, &screen->fence.bo); nouveau_bo_ref(NULL, &screen->poly_cache); + nouveau_bo_ref(NULL, &screen->parm); nouveau_heap_destroy(&screen->lib_code); nouveau_heap_destroy(&screen->text_heap); @@ -412,6 +466,23 @@ nvc0_screen_fence_update(struct pipe_screen *pscreen) return screen->fence.map[0]; } +static int +nvc0_screen_init_compute(struct nvc0_screen *screen) +{ + screen->base.base.get_compute_param = nvc0_screen_get_compute_param; + + switch (screen->base.device->chipset & 0xf0) { + case 0xc0: + case 0xd0: + return 0; + case 0xe0: + case 0xf0: + return nve4_screen_compute_setup(screen, screen->base.pushbuf); + default: + return -1; + } +} + #define FAIL_SCREEN_INIT(str, err) \ do { \ NOUVEAU_ERR(str, err); \ @@ -653,9 +724,9 @@ nvc0_screen_create(struct nouveau_device *dev) /* max MPs * max warps per MP (TODO: ask kernel) */ if (screen->eng3d->oclass >= NVE4_3D_CLASS) - screen->tls_size = 8 * 64; + screen->tls_size = 8 * 64 * 32; else - screen->tls_size = 16 * 48; + screen->tls_size = 16 * 48 * 32; screen->tls_size *= NVC0_CAP_MAX_PROGRAM_TEMPS * 16; screen->tls_size = align(screen->tls_size, 1 << 17); @@ -775,6 +846,9 @@ nvc0_screen_create(struct nouveau_device *dev) IMMED_NVC0(push, NVC0_3D(EDGEFLAG), 1); + if (nvc0_screen_init_compute(screen)) + goto fail; + PUSH_KICK (push); screen->tic.entries = CALLOC(4096, sizeof(void *)); diff --git a/src/gallium/drivers/nvc0/nvc0_screen.h b/src/gallium/drivers/nvc0/nvc0_screen.h index 2adcfeac3ef..16f0febd3ea 100644 --- a/src/gallium/drivers/nvc0/nvc0_screen.h +++ b/src/gallium/drivers/nvc0/nvc0_screen.h @@ -15,7 +15,10 @@ #define NVC0_TSC_MAX_ENTRIES 2048 /* doesn't count reserved slots (for auxiliary constants, immediates, etc.) 
*/ -#define NVC0_MAX_PIPE_CONSTBUFS 14 +#define NVC0_MAX_PIPE_CONSTBUFS 14 +#define NVE4_MAX_PIPE_CONSTBUFS_COMPUTE 7 + +#define NVC0_MAX_SURFACE_SLOTS 16 struct nvc0_context; @@ -29,7 +32,8 @@ struct nvc0_screen { int num_occlusion_queries_active; struct nouveau_bo *text; - struct nouveau_bo *uniform_bo; + struct nouveau_bo *parm; /* for COMPUTE */ + struct nouveau_bo *uniform_bo; /* for 3D */ struct nouveau_bo *tls; struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */ struct nouveau_bo *poly_cache; @@ -63,7 +67,7 @@ struct nvc0_screen { struct nouveau_object *eng3d; /* sqrt(1/2)|kepler> + sqrt(1/2)|fermi> */ struct nouveau_object *eng2d; struct nouveau_object *m2mf; - struct nouveau_object *dijkstra; + struct nouveau_object *compute; }; static INLINE struct nvc0_screen * @@ -80,6 +84,8 @@ void nvc0_screen_make_buffers_resident(struct nvc0_screen *); int nvc0_screen_tic_alloc(struct nvc0_screen *, void *); int nvc0_screen_tsc_alloc(struct nvc0_screen *, void *); +int nve4_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *); + static INLINE void nvc0_resource_fence(struct nv04_resource *res, uint32_t flags) { diff --git a/src/gallium/drivers/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nvc0/nvc0_shader_state.c index 786889f8b57..5cd6a84be57 100644 --- a/src/gallium/drivers/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nvc0/nvc0_shader_state.c @@ -95,7 +95,7 @@ nvc0_vertprog_validate(struct nvc0_context *nvc0) PUSH_DATA (push, 0x11); PUSH_DATA (push, vp->code_base); BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(1)), 1); - PUSH_DATA (push, vp->max_gpr); + PUSH_DATA (push, vp->num_gprs); // BEGIN_NVC0(push, NVC0_3D_(0x163c), 1); // PUSH_DATA (push, 0); @@ -120,7 +120,7 @@ nvc0_fragprog_validate(struct nvc0_context *nvc0) PUSH_DATA (push, 0x51); PUSH_DATA (push, fp->code_base); BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(5)), 1); - PUSH_DATA (push, fp->max_gpr); + PUSH_DATA (push, fp->num_gprs); BEGIN_NVC0(push, SUBC_3D(0x0360), 2); PUSH_DATA (push, 0x20164010); @@ -144,7 +144,7 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0) PUSH_DATA (push, 0x21); PUSH_DATA (push, tp->code_base); BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1); - PUSH_DATA (push, tp->max_gpr); + PUSH_DATA (push, tp->num_gprs); if (tp->tp.input_patch_size <= 32) IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), tp->tp.input_patch_size); @@ -171,7 +171,7 @@ nvc0_tevlprog_validate(struct nvc0_context *nvc0) BEGIN_NVC0(push, NVC0_3D(SP_START_ID(3)), 1); PUSH_DATA (push, tp->code_base); BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(3)), 1); - PUSH_DATA (push, tp->max_gpr); + PUSH_DATA (push, tp->num_gprs); } else { BEGIN_NVC0(push, NVC0_3D(MACRO_TEP_SELECT), 1); PUSH_DATA (push, 0x30); @@ -197,7 +197,7 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0) BEGIN_NVC0(push, NVC0_3D(SP_START_ID(4)), 1); PUSH_DATA (push, gp->code_base); BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(4)), 1); - PUSH_DATA (push, gp->max_gpr); + PUSH_DATA (push, gp->num_gprs); BEGIN_NVC0(push, NVC0_3D(LAYER), 1); PUSH_DATA (push, gp_selects_layer ? 
NVC0_3D_LAYER_USE_GP : 0); } else { diff --git a/src/gallium/drivers/nvc0/nvc0_state.c b/src/gallium/drivers/nvc0/nvc0_state.c index 30011df4dc1..cba076fb982 100644 --- a/src/gallium/drivers/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nvc0/nvc0_state.c @@ -489,6 +489,57 @@ nvc0_gp_sampler_states_bind(struct pipe_context *pipe, unsigned nr, void **s) nvc0_stage_sampler_states_bind(nvc0_context(pipe), 3, nr, s); } +static void +nvc0_stage_sampler_states_bind_range(struct nvc0_context *nvc0, + const unsigned s, + unsigned start, unsigned nr, void **cso) +{ + const unsigned end = start + nr; + int last_valid = -1; + unsigned i; + + if (cso) { + for (i = start; i < end; ++i) { + const unsigned p = i - start; + if (cso[p]) + last_valid = i; + if (cso[p] == nvc0->samplers[s][i]) + continue; + nvc0->samplers_dirty[s] |= 1 << i; + + if (nvc0->samplers[s][i]) + nvc0_screen_tsc_unlock(nvc0->screen, nvc0->samplers[s][i]); + nvc0->samplers[s][i] = cso[p]; + } + } else { + for (i = start; i < end; ++i) { + if (nvc0->samplers[s][i]) { + nvc0_screen_tsc_unlock(nvc0->screen, nvc0->samplers[s][i]); + nvc0->samplers[s][i] = NULL; + nvc0->samplers_dirty[s] |= 1 << i; + } + } + } + + if (nvc0->num_samplers[s] <= end) { + if (last_valid < 0) { + for (i = start; i && !nvc0->samplers[s][i - 1]; --i); + nvc0->num_samplers[s] = i; + } else { + nvc0->num_samplers[s] = last_valid + 1; + } + } +} + +static void +nvc0_cp_sampler_states_bind(struct pipe_context *pipe, + unsigned start, unsigned nr, void **cso) +{ + nvc0_stage_sampler_states_bind_range(nvc0_context(pipe), 5, start, nr, cso); + + nvc0_context(pipe)->dirty_cp |= NVC0_NEW_CP_SAMPLERS; +} + /* NOTE: only called when not referenced anywhere, won't be bound */ static void nvc0_sampler_view_destroy(struct pipe_context *pipe, @@ -561,6 +612,67 @@ nvc0_gp_set_sampler_views(struct pipe_context *pipe, nvc0_stage_set_sampler_views(nvc0_context(pipe), 3, nr, views); } +static void +nvc0_stage_set_sampler_views_range(struct nvc0_context *nvc0, const unsigned s, + unsigned start, unsigned nr, + struct pipe_sampler_view **views) +{ + struct nouveau_bufctx *bctx = (s == 5) ? nvc0->bufctx_cp : nvc0->bufctx_3d; + const unsigned end = start + nr; + const unsigned bin = (s == 5) ? 
NVC0_BIND_CP_TEX(0) : NVC0_BIND_TEX(s, 0); + int last_valid = -1; + unsigned i; + + if (views) { + for (i = start; i < end; ++i) { + const unsigned p = i - start; + if (views[p]) + last_valid = i; + if (views[p] == nvc0->textures[s][i]) + continue; + nvc0->textures_dirty[s] |= 1 << i; + + if (nvc0->textures[s][i]) { + struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]); + nouveau_bufctx_reset(bctx, bin + i); + nvc0_screen_tic_unlock(nvc0->screen, old); + } + pipe_sampler_view_reference(&nvc0->textures[s][i], views[p]); + } + } else { + for (i = start; i < end; ++i) { + struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]); + if (!old) + continue; + nvc0->textures_dirty[s] |= 1 << i; + + nvc0_screen_tic_unlock(nvc0->screen, old); + pipe_sampler_view_reference(&nvc0->textures[s][i], NULL); + nouveau_bufctx_reset(bctx, bin + i); + } + } + + if (nvc0->num_textures[s] <= end) { + if (last_valid < 0) { + for (i = start; i && !nvc0->textures[s][i - 1]; --i); + nvc0->num_textures[s] = i; + } else { + nvc0->num_textures[s] = last_valid + 1; + } + } +} + +static void +nvc0_cp_set_sampler_views(struct pipe_context *pipe, + unsigned start, unsigned nr, + struct pipe_sampler_view **views) +{ + nvc0_stage_set_sampler_views_range(nvc0_context(pipe), 5, start, nr, views); + + nvc0_context(pipe)->dirty_cp |= NVC0_NEW_CP_TEXTURES; +} + + /* ============================= SHADERS ======================================= */ @@ -644,6 +756,35 @@ nvc0_gp_state_bind(struct pipe_context *pipe, void *hwcso) nvc0->dirty |= NVC0_NEW_GMTYPROG; } +static void * +nvc0_cp_state_create(struct pipe_context *pipe, + const struct pipe_compute_state *cso) +{ + struct nvc0_program *prog; + + prog = CALLOC_STRUCT(nvc0_program); + if (!prog) + return NULL; + prog->type = PIPE_SHADER_COMPUTE; + + prog->cp.smem_size = cso->req_local_mem; + prog->cp.lmem_size = cso->req_private_mem; + prog->parm_size = cso->req_input_mem; + + prog->pipe.tokens = tgsi_dup_tokens((const struct tgsi_token *)cso->prog); + + return (void *)prog; +} + +static void +nvc0_cp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nvc0_context *nvc0 = nvc0_context(pipe); + + nvc0->compprog = hwcso; + nvc0->dirty_cp |= NVC0_NEW_CP_PROGRAM; +} + static void nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, struct pipe_constant_buffer *cb) @@ -653,14 +794,22 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, const unsigned s = nvc0_shader_stage(shader); const unsigned i = index; - if (shader == PIPE_SHADER_COMPUTE) - return; + if (unlikely(shader == PIPE_SHADER_COMPUTE)) { + assert(!cb || !cb->user_buffer); + if (nvc0->constbuf[s][i].u.buf) + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_CB(i)); - if (nvc0->constbuf[s][i].user) - nvc0->constbuf[s][i].u.buf = NULL; - else - if (nvc0->constbuf[s][i].u.buf) - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_CB(s, i)); + nvc0->dirty_cp |= NVC0_NEW_CP_CONSTBUF; + } else { + if (nvc0->constbuf[s][i].user) + nvc0->constbuf[s][i].u.buf = NULL; + else + if (nvc0->constbuf[s][i].u.buf) + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_CB(s, i)); + + nvc0->dirty |= NVC0_NEW_CONSTBUF; + } + nvc0->constbuf_dirty[s] |= 1 << i; pipe_resource_reference(&nvc0->constbuf[s][i].u.buf, res); @@ -673,10 +822,6 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, nvc0->constbuf[s][i].offset = cb->buffer_offset; nvc0->constbuf[s][i].size = align(cb->buffer_size, 0x100); } - - nvc0->constbuf_dirty[s] |= 1 << i; - - 
nvc0->dirty |= NVC0_NEW_CONSTBUF; } /* ============================================================================= @@ -919,6 +1064,113 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe, nvc0->dirty |= NVC0_NEW_TFB_TARGETS; } +static void +nvc0_bind_surfaces_range(struct nvc0_context *nvc0, const unsigned t, + unsigned start, unsigned nr, + struct pipe_surface **psurfaces) +{ + const unsigned end = start + nr; + const unsigned mask = ((1 << nr) - 1) << start; + unsigned i; + + if (psurfaces) { + for (i = start; i < end; ++i) { + const unsigned p = i - start; + if (psurfaces[p]) + nvc0->surfaces_valid[t] |= (1 << i); + else + nvc0->surfaces_valid[t] &= ~(1 << i); + pipe_surface_reference(&nvc0->surfaces[t][i], psurfaces[p]); + } + } else { + for (i = start; i < end; ++i) + pipe_surface_reference(&nvc0->surfaces[t][i], NULL); + nvc0->surfaces_valid[t] &= ~mask; + } + nvc0->surfaces_dirty[t] |= mask; + + if (t == 0) + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_SUF); + else + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF); +} + +static void +nvc0_set_compute_resources(struct pipe_context *pipe, + unsigned start, unsigned nr, + struct pipe_surface **resources) +{ + nvc0_bind_surfaces_range(nvc0_context(pipe), 1, start, nr, resources); + + nvc0_context(pipe)->dirty_cp |= NVC0_NEW_CP_SURFACES; +} + +static void +nvc0_set_shader_resources(struct pipe_context *pipe, + unsigned start, unsigned nr, + struct pipe_surface **resources) +{ + nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, resources); + + nvc0_context(pipe)->dirty |= NVC0_NEW_SURFACES; +} + +static INLINE void +nvc0_set_global_handle(uint32_t *phandle, struct pipe_resource *res) +{ + struct nv04_resource *buf = nv04_resource(res); + if (buf) { + uint64_t limit = (buf->address + buf->base.width0) - 1; + if (limit < (1ULL << 32)) { + *phandle = (uint32_t)buf->address; + } else { + NOUVEAU_ERR("Cannot map into TGSI_RESOURCE_GLOBAL: " + "resource not contained within 32-bit address space !\n"); + *phandle = 0; + } + } else { + *phandle = 0; + } +} + +static void +nvc0_set_global_bindings(struct pipe_context *pipe, + unsigned start, unsigned nr, + struct pipe_resource **resources, + uint32_t **handles) +{ + struct nvc0_context *nvc0 = nvc0_context(pipe); + struct pipe_resource **ptr; + unsigned i; + const unsigned end = start + nr; + + if (nvc0->global_residents.size <= (end * sizeof(struct pipe_resource *))) { + const unsigned old_size = nvc0->global_residents.size; + const unsigned req_size = end * sizeof(struct pipe_resource *); + util_dynarray_resize(&nvc0->global_residents, req_size); + memset((uint8_t *)nvc0->global_residents.data + old_size, 0, + req_size - old_size); + } + + if (resources) { + ptr = util_dynarray_element( + &nvc0->global_residents, struct pipe_resource *, start); + for (i = 0; i < nr; ++i) { + pipe_resource_reference(&ptr[i], resources[i]); + nvc0_set_global_handle(handles[i], resources[i]); + } + } else { + ptr = util_dynarray_element( + &nvc0->global_residents, struct pipe_resource *, start); + for (i = 0; i < nr; ++i) + pipe_resource_reference(&ptr[i], NULL); + } + + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL); + + nvc0->dirty_cp = NVC0_NEW_CP_GLOBALS; +} + void nvc0_init_state_functions(struct nvc0_context *nvc0) { @@ -941,12 +1193,14 @@ nvc0_init_state_functions(struct nvc0_context *nvc0) pipe->bind_vertex_sampler_states = nvc0_vp_sampler_states_bind; pipe->bind_fragment_sampler_states = nvc0_fp_sampler_states_bind; pipe->bind_geometry_sampler_states = 
nvc0_gp_sampler_states_bind; + pipe->bind_compute_sampler_states = nvc0_cp_sampler_states_bind; pipe->create_sampler_view = nvc0_create_sampler_view; pipe->sampler_view_destroy = nvc0_sampler_view_destroy; pipe->set_vertex_sampler_views = nvc0_vp_set_sampler_views; pipe->set_fragment_sampler_views = nvc0_fp_set_sampler_views; pipe->set_geometry_sampler_views = nvc0_gp_set_sampler_views; + pipe->set_compute_sampler_views = nvc0_cp_set_sampler_views; pipe->create_vs_state = nvc0_vp_state_create; pipe->create_fs_state = nvc0_fp_state_create; @@ -958,6 +1212,10 @@ nvc0_init_state_functions(struct nvc0_context *nvc0) pipe->delete_fs_state = nvc0_sp_state_delete; pipe->delete_gs_state = nvc0_sp_state_delete; + pipe->create_compute_state = nvc0_cp_state_create; + pipe->bind_compute_state = nvc0_cp_state_bind; + pipe->delete_compute_state = nvc0_sp_state_delete; + pipe->set_blend_color = nvc0_set_blend_color; pipe->set_stencil_ref = nvc0_set_stencil_ref; pipe->set_clip_state = nvc0_set_clip_state; @@ -978,5 +1236,9 @@ nvc0_init_state_functions(struct nvc0_context *nvc0) pipe->create_stream_output_target = nvc0_so_target_create; pipe->stream_output_target_destroy = nvc0_so_target_destroy; pipe->set_stream_output_targets = nvc0_set_transform_feedback_targets; + + pipe->set_global_binding = nvc0_set_global_bindings; + pipe->set_compute_resources = nvc0_set_compute_resources; + pipe->set_shader_resources = nvc0_set_shader_resources; } diff --git a/src/gallium/drivers/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nvc0/nvc0_state_validate.c index 80a8c01a51a..1f12de61e9f 100644 --- a/src/gallium/drivers/nvc0/nvc0_state_validate.c +++ b/src/gallium/drivers/nvc0/nvc0_state_validate.c @@ -430,6 +430,21 @@ nvc0_validate_sample_mask(struct nvc0_context *nvc0) PUSH_DATA (push, 0x01); } +void +nvc0_validate_global_residents(struct nvc0_context *nvc0, + struct nouveau_bufctx *bctx, int bin) +{ + unsigned i; + + for (i = 0; i < nvc0->global_residents.size / sizeof(struct pipe_resource *); + ++i) { + struct pipe_resource *res = *util_dynarray_element( + &nvc0->global_residents, struct pipe_resource *, i); + if (res) + nvc0_add_resident(bctx, bin, nv04_resource(res), NOUVEAU_BO_RDWR); + } +} + static void nvc0_validate_derived_1(struct nvc0_context *nvc0) { @@ -513,6 +528,7 @@ static struct state_validate { { nvc0_validate_samplers, NVC0_NEW_SAMPLERS }, { nve4_set_tex_handles, NVC0_NEW_TEXTURES | NVC0_NEW_SAMPLERS }, { nvc0_vertex_arrays_validate, NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS }, + { nvc0_validate_surfaces, NVC0_NEW_SURFACES }, { nvc0_idxbuf_validate, NVC0_NEW_IDXBUF }, { nvc0_tfb_validate, NVC0_NEW_TFB_TARGETS | NVC0_NEW_GMTYPROG } }; diff --git a/src/gallium/drivers/nvc0/nvc0_surface.c b/src/gallium/drivers/nvc0/nvc0_surface.c index 281d740b218..77330c52ac5 100644 --- a/src/gallium/drivers/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nvc0/nvc0_surface.c @@ -515,7 +515,7 @@ nvc0_blitter_make_vp(struct nvc0_blitter *blit) blit->vp.code = (uint32_t *)code_nvc0; /* const_cast */ blit->vp.code_size = sizeof(code_nvc0); } - blit->vp.max_gpr = 7; + blit->vp.num_gprs = 7; blit->vp.vp.edgeflag = PIPE_MAX_ATTRIBS; blit->vp.hdr[0] = 0x00020461; /* vertprog magic */ diff --git a/src/gallium/drivers/nvc0/nvc0_tex.c b/src/gallium/drivers/nvc0/nvc0_tex.c index 2bce97b32c7..7fbe1e6736b 100644 --- a/src/gallium/drivers/nvc0/nvc0_tex.c +++ b/src/gallium/drivers/nvc0/nvc0_tex.c @@ -23,6 +23,7 @@ #include "nvc0_context.h" #include "nvc0_resource.h" #include "nv50/nv50_texture.xml.h" +#include "nv50/nv50_defs.xml.h" #include 
"util/u_format.h" @@ -413,7 +414,7 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s) return need_flush; } -static boolean +boolean nve4_validate_tsc(struct nvc0_context *nvc0, int s) { struct nouveau_bo *txc = nvc0->screen->txc; @@ -515,3 +516,295 @@ nve4_set_tex_handles(struct nvc0_context *nvc0) nvc0->samplers_dirty[s] = 0; } } + + +static const uint8_t nve4_su_format_map[PIPE_FORMAT_COUNT]; +static const uint16_t nve4_su_format_aux_map[PIPE_FORMAT_COUNT]; +static const uint16_t nve4_suldp_lib_offset[PIPE_FORMAT_COUNT]; + +void +nve4_set_surface_info(struct nouveau_pushbuf *push, + struct pipe_surface *psf, + struct nvc0_screen *screen) +{ + struct nv50_surface *sf = nv50_surface(psf); + struct nv04_resource *res; + uint64_t address; + uint32_t *const info = push->cur; + uint8_t log2cpp; + + if (psf && !nve4_su_format_map[psf->format]) + NOUVEAU_ERR("unsupported surface format, try is_format_supported() !\n"); + + push->cur += 16; + + if (!psf || !nve4_su_format_map[psf->format]) { + memset(info, 0, 16 * sizeof(*info)); + + info[0] = 0xbadf0000; + info[1] = 0x80004000; + info[12] = nve4_suldp_lib_offset[PIPE_FORMAT_R32G32B32A32_UINT] + + screen->lib_code->start; + return; + } + res = nv04_resource(sf->base.texture); + + address = res->address + sf->offset; + + info[8] = sf->width; + info[9] = sf->height; + info[10] = sf->depth; + switch (res->base.target) { + case PIPE_TEXTURE_1D_ARRAY: + info[11] = 1; + break; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + info[11] = 2; + break; + case PIPE_TEXTURE_3D: + info[11] = 3; + break; + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + info[11] = 4; + break; + default: + info[11] = 0; + break; + } + log2cpp = (0xf000 & nve4_su_format_aux_map[sf->base.format]) >> 12; + + info[12] = nve4_suldp_lib_offset[sf->base.format] + screen->lib_code->start; + + /* limit in bytes for raw access */ + info[13] = (0x06 << 22) | ((sf->width << log2cpp) - 1); + + info[1] = nve4_su_format_map[sf->base.format]; + +#if 0 + switch (util_format_get_blocksizebits(sf->base.format)) { + case 16: info[1] |= 1 << 16; break; + case 32: info[1] |= 2 << 16; break; + case 64: info[1] |= 3 << 16; break; + case 128: info[1] |= 4 << 16; break; + default: + break; + } +#else + info[1] |= log2cpp << 16; + info[1] |= 0x4000; + info[1] |= (0x0f00 & nve4_su_format_aux_map[sf->base.format]); +#endif + + if (res->base.target == PIPE_BUFFER) { + info[0] = address >> 8; + info[2] = sf->width - 1; + info[2] |= (0xff & nve4_su_format_aux_map[sf->base.format]) << 22; + info[3] = 0; + info[4] = 0; + info[5] = 0; + info[6] = 0; + info[7] = 0; + info[14] = 0; + info[15] = 0; + } else { + struct nv50_miptree *mt = nv50_miptree(&res->base); + struct nv50_miptree_level *lvl = &mt->level[sf->base.u.tex.level]; + const unsigned z = sf->base.u.tex.first_layer; + + if (z) { + if (mt->layout_3d) { + address += nvc0_mt_zslice_offset(mt, psf->u.tex.level, z); + /* doesn't work if z passes z-tile boundary */ + assert(sf->depth == 1); + } else { + address += mt->layer_stride * z; + } + } + info[0] = address >> 8; + info[2] = sf->width - 1; + /* NOTE: this is really important: */ + info[2] |= (0xff & nve4_su_format_aux_map[sf->base.format]) << 22; + info[3] = (0x88 << 24) | (lvl->pitch / 64); + info[4] = sf->height - 1; + info[4] |= (lvl->tile_mode & 0x0f0) << 25; + info[4] |= NVC0_TILE_SHIFT_Y(lvl->tile_mode) << 22; + info[5] = mt->layer_stride >> 8; + info[6] = sf->depth - 1; + info[6] |= (lvl->tile_mode & 0xf00) << 21; + info[6] |= 
NVC0_TILE_SHIFT_Z(lvl->tile_mode) << 22; + info[7] = 0; + info[14] = mt->ms_x; + info[15] = mt->ms_y; + } +} + +static INLINE void +nvc0_update_surface_bindings(struct nvc0_context *nvc0) +{ + /* TODO */ +} + +static INLINE void +nve4_update_surface_bindings(struct nvc0_context *nvc0) +{ + /* TODO */ +} + +void +nvc0_validate_surfaces(struct nvc0_context *nvc0) +{ + if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) { + nve4_update_surface_bindings(nvc0); + } else { + nvc0_update_surface_bindings(nvc0); + } +} + + +static const uint8_t nve4_su_format_map[PIPE_FORMAT_COUNT] = +{ + [PIPE_FORMAT_R32G32B32A32_FLOAT] = NVE4_IMAGE_FORMAT_RGBA32_FLOAT, + [PIPE_FORMAT_R32G32B32A32_SINT] = NVE4_IMAGE_FORMAT_RGBA32_SINT, + [PIPE_FORMAT_R32G32B32A32_UINT] = NVE4_IMAGE_FORMAT_RGBA32_UINT, + [PIPE_FORMAT_R16G16B16A16_FLOAT] = NVE4_IMAGE_FORMAT_RGBA16_FLOAT, + [PIPE_FORMAT_R16G16B16A16_UNORM] = NVE4_IMAGE_FORMAT_RGBA16_UNORM, + [PIPE_FORMAT_R16G16B16A16_SNORM] = NVE4_IMAGE_FORMAT_RGBA16_SNORM, + [PIPE_FORMAT_R16G16B16A16_SINT] = NVE4_IMAGE_FORMAT_RGBA16_SINT, + [PIPE_FORMAT_R16G16B16A16_UINT] = NVE4_IMAGE_FORMAT_RGBA16_UINT, + [PIPE_FORMAT_R8G8B8A8_UNORM] = NVE4_IMAGE_FORMAT_RGBA8_UNORM, + [PIPE_FORMAT_R8G8B8A8_SNORM] = NVE4_IMAGE_FORMAT_RGBA8_SNORM, + [PIPE_FORMAT_R8G8B8A8_SINT] = NVE4_IMAGE_FORMAT_RGBA8_SINT, + [PIPE_FORMAT_R8G8B8A8_UINT] = NVE4_IMAGE_FORMAT_RGBA8_UINT, + [PIPE_FORMAT_R11G11B10_FLOAT] = NVE4_IMAGE_FORMAT_R11G11B10_FLOAT, + [PIPE_FORMAT_R10G10B10A2_UNORM] = NVE4_IMAGE_FORMAT_RGB10_A2_UNORM, +/* [PIPE_FORMAT_R10G10B10A2_UINT] = NVE4_IMAGE_FORMAT_RGB10_A2_UINT, */ + [PIPE_FORMAT_R32G32_FLOAT] = NVE4_IMAGE_FORMAT_RG32_FLOAT, + [PIPE_FORMAT_R32G32_SINT] = NVE4_IMAGE_FORMAT_RG32_SINT, + [PIPE_FORMAT_R32G32_UINT] = NVE4_IMAGE_FORMAT_RG32_UINT, + [PIPE_FORMAT_R16G16_FLOAT] = NVE4_IMAGE_FORMAT_RG16_FLOAT, + [PIPE_FORMAT_R16G16_UNORM] = NVE4_IMAGE_FORMAT_RG16_UNORM, + [PIPE_FORMAT_R16G16_SNORM] = NVE4_IMAGE_FORMAT_RG16_SNORM, + [PIPE_FORMAT_R16G16_SINT] = NVE4_IMAGE_FORMAT_RG16_SINT, + [PIPE_FORMAT_R16G16_UINT] = NVE4_IMAGE_FORMAT_RG16_UINT, + [PIPE_FORMAT_R8G8_UNORM] = NVE4_IMAGE_FORMAT_RG8_UNORM, + [PIPE_FORMAT_R8G8_SNORM] = NVE4_IMAGE_FORMAT_RG8_SNORM, + [PIPE_FORMAT_R8G8_SINT] = NVE4_IMAGE_FORMAT_RG8_SINT, + [PIPE_FORMAT_R8G8_UINT] = NVE4_IMAGE_FORMAT_RG8_UINT, + [PIPE_FORMAT_R32_FLOAT] = NVE4_IMAGE_FORMAT_R32_FLOAT, + [PIPE_FORMAT_R32_SINT] = NVE4_IMAGE_FORMAT_R32_SINT, + [PIPE_FORMAT_R32_UINT] = NVE4_IMAGE_FORMAT_R32_UINT, + [PIPE_FORMAT_R16_FLOAT] = NVE4_IMAGE_FORMAT_R16_FLOAT, + [PIPE_FORMAT_R16_UNORM] = NVE4_IMAGE_FORMAT_R16_UNORM, + [PIPE_FORMAT_R16_SNORM] = NVE4_IMAGE_FORMAT_R16_SNORM, + [PIPE_FORMAT_R16_SINT] = NVE4_IMAGE_FORMAT_R16_SINT, + [PIPE_FORMAT_R16_UINT] = NVE4_IMAGE_FORMAT_R16_UINT, + [PIPE_FORMAT_R8_UNORM] = NVE4_IMAGE_FORMAT_R8_UNORM, + [PIPE_FORMAT_R8_SNORM] = NVE4_IMAGE_FORMAT_R8_SNORM, + [PIPE_FORMAT_R8_SINT] = NVE4_IMAGE_FORMAT_R8_SINT, + [PIPE_FORMAT_R8_UINT] = NVE4_IMAGE_FORMAT_R8_UINT, +}; + +/* Auxiliary format description values for surface instructions. 
+ * (log2(bytes per pixel) << 12) | (unk8 << 8) | unk22 + */ +static const uint16_t nve4_su_format_aux_map[PIPE_FORMAT_COUNT] = +{ + [PIPE_FORMAT_R32G32B32A32_FLOAT] = 0x4842, + [PIPE_FORMAT_R32G32B32A32_SINT] = 0x4842, + [PIPE_FORMAT_R32G32B32A32_UINT] = 0x4842, + + [PIPE_FORMAT_R16G16B16A16_UNORM] = 0x3933, + [PIPE_FORMAT_R16G16B16A16_SNORM] = 0x3933, + [PIPE_FORMAT_R16G16B16A16_SINT] = 0x3933, + [PIPE_FORMAT_R16G16B16A16_UINT] = 0x3933, + [PIPE_FORMAT_R16G16B16A16_FLOAT] = 0x3933, + + [PIPE_FORMAT_R32G32_FLOAT] = 0x3433, + [PIPE_FORMAT_R32G32_SINT] = 0x3433, + [PIPE_FORMAT_R32G32_UINT] = 0x3433, + + [PIPE_FORMAT_R10G10B10A2_UNORM] = 0x2a24, +/* [PIPE_FORMAT_R10G10B10A2_UINT] = 0x2a24, */ + [PIPE_FORMAT_R8G8B8A8_UNORM] = 0x2a24, + [PIPE_FORMAT_R8G8B8A8_SNORM] = 0x2a24, + [PIPE_FORMAT_R8G8B8A8_SINT] = 0x2a24, + [PIPE_FORMAT_R8G8B8A8_UINT] = 0x2a24, + [PIPE_FORMAT_R11G11B10_FLOAT] = 0x2a24, + + [PIPE_FORMAT_R16G16_UNORM] = 0x2524, + [PIPE_FORMAT_R16G16_SNORM] = 0x2524, + [PIPE_FORMAT_R16G16_SINT] = 0x2524, + [PIPE_FORMAT_R16G16_UINT] = 0x2524, + [PIPE_FORMAT_R16G16_FLOAT] = 0x2524, + + [PIPE_FORMAT_R32_SINT] = 0x2024, + [PIPE_FORMAT_R32_UINT] = 0x2024, + [PIPE_FORMAT_R32_FLOAT] = 0x2024, + + [PIPE_FORMAT_R8G8_UNORM] = 0x1615, + [PIPE_FORMAT_R8G8_SNORM] = 0x1615, + [PIPE_FORMAT_R8G8_SINT] = 0x1615, + [PIPE_FORMAT_R8G8_UINT] = 0x1615, + + [PIPE_FORMAT_R16_UNORM] = 0x1115, + [PIPE_FORMAT_R16_SNORM] = 0x1115, + [PIPE_FORMAT_R16_SINT] = 0x1115, + [PIPE_FORMAT_R16_UINT] = 0x1115, + [PIPE_FORMAT_R16_FLOAT] = 0x1115, + + [PIPE_FORMAT_R8_UNORM] = 0x0206, + [PIPE_FORMAT_R8_SNORM] = 0x0206, + [PIPE_FORMAT_R8_SINT] = 0x0206, + [PIPE_FORMAT_R8_UINT] = 0x0206 +}; + +/* NOTE: These are hardcoded offsets for the shader library. + * TODO: Automate them. + */ +static const uint16_t nve4_suldp_lib_offset[PIPE_FORMAT_COUNT] = +{ + [PIPE_FORMAT_R32G32B32A32_FLOAT] = 0x218, + [PIPE_FORMAT_R32G32B32A32_SINT] = 0x218, + [PIPE_FORMAT_R32G32B32A32_UINT] = 0x218, + [PIPE_FORMAT_R16G16B16A16_UNORM] = 0x248, + [PIPE_FORMAT_R16G16B16A16_SNORM] = 0x2b8, + [PIPE_FORMAT_R16G16B16A16_SINT] = 0x330, + [PIPE_FORMAT_R16G16B16A16_UINT] = 0x388, + [PIPE_FORMAT_R16G16B16A16_FLOAT] = 0x3d8, + [PIPE_FORMAT_R32G32_FLOAT] = 0x428, + [PIPE_FORMAT_R32G32_SINT] = 0x468, + [PIPE_FORMAT_R32G32_UINT] = 0x468, + [PIPE_FORMAT_R10G10B10A2_UNORM] = 0x4a8, +/* [PIPE_FORMAT_R10G10B10A2_UINT] = 0x530, */ + [PIPE_FORMAT_R8G8B8A8_UNORM] = 0x588, + [PIPE_FORMAT_R8G8B8A8_SNORM] = 0x5f8, + [PIPE_FORMAT_R8G8B8A8_SINT] = 0x670, + [PIPE_FORMAT_R8G8B8A8_UINT] = 0x6c8, + [PIPE_FORMAT_B5G6R5_UNORM] = 0x718, + [PIPE_FORMAT_B5G5R5X1_UNORM] = 0x7a0, + [PIPE_FORMAT_R16G16_UNORM] = 0x828, + [PIPE_FORMAT_R16G16_SNORM] = 0x890, + [PIPE_FORMAT_R16G16_SINT] = 0x8f0, + [PIPE_FORMAT_R16G16_UINT] = 0x948, + [PIPE_FORMAT_R16G16_FLOAT] = 0x998, + [PIPE_FORMAT_R32_FLOAT] = 0x9e8, + [PIPE_FORMAT_R32_SINT] = 0xa30, + [PIPE_FORMAT_R32_UINT] = 0xa30, + [PIPE_FORMAT_R8G8_UNORM] = 0xa78, + [PIPE_FORMAT_R8G8_SNORM] = 0xae0, + [PIPE_FORMAT_R8G8_UINT] = 0xb48, + [PIPE_FORMAT_R8G8_SINT] = 0xb98, + [PIPE_FORMAT_R16_UNORM] = 0xbe8, + [PIPE_FORMAT_R16_SNORM] = 0xc48, + [PIPE_FORMAT_R16_SINT] = 0xca0, + [PIPE_FORMAT_R16_UINT] = 0xce8, + [PIPE_FORMAT_R16_FLOAT] = 0xd30, + [PIPE_FORMAT_R8_UNORM] = 0xd88, + [PIPE_FORMAT_R8_SNORM] = 0xde0, + [PIPE_FORMAT_R8_SINT] = 0xe38, + [PIPE_FORMAT_R8_UINT] = 0xe88, + [PIPE_FORMAT_R11G11B10_FLOAT] = 0xed0 +}; diff --git a/src/gallium/drivers/nvc0/nve4_compute.c b/src/gallium/drivers/nvc0/nve4_compute.c new file mode 100644 index 
00000000000..e823d210952 --- /dev/null +++ b/src/gallium/drivers/nvc0/nve4_compute.c @@ -0,0 +1,607 @@ +/* + * Copyright 2012 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Christoph Bumiller + */ + +#include "nvc0_context.h" +#include "nve4_compute.h" + +#include "nv50/codegen/nv50_ir_driver.h" + +static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *); + + +int +nve4_screen_compute_setup(struct nvc0_screen *screen, + struct nouveau_pushbuf *push) +{ + struct nouveau_device *dev = screen->base.device; + struct nouveau_object *chan = screen->base.channel; + unsigned i; + int ret; + uint32_t obj_class; + + switch (dev->chipset & 0xf0) { + case 0xf0: + obj_class = NVF0_COMPUTE_CLASS; /* GK110 */ + break; + case 0xe0: + obj_class = NVE4_COMPUTE_CLASS; /* GK104 */ + break; + default: + NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset); + break; + } + + ret = nouveau_object_new(chan, 0xbeef00c0, obj_class, NULL, 0, + &screen->compute); + if (ret) { + NOUVEAU_ERR("Failed to allocate compute object: %d\n", ret); + return ret; + } + + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, NVE4_CP_PARAM_SIZE, NULL, + &screen->parm); + if (ret) + return ret; + + BEGIN_NVC0(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1); + PUSH_DATA (push, screen->compute->oclass); + + BEGIN_NVC0(push, NVE4_COMPUTE(TEMP_ADDRESS_HIGH), 2); + PUSH_DATAh(push, screen->tls->offset); + PUSH_DATA (push, screen->tls->offset); + /* No idea why there are 2. Divide size by 2 to be safe. + * Actually this might be per-MP TEMP size and looks like I'm only using + * 2 MPs instead of all 8. + */ + BEGIN_NVC0(push, NVE4_COMPUTE(TEMP_SIZE_HIGH(0)), 3); + PUSH_DATAh(push, screen->tls_size / 2); + PUSH_DATA (push, screen->tls_size / 2); + PUSH_DATA (push, 0xff); + BEGIN_NVC0(push, NVE4_COMPUTE(TEMP_SIZE_HIGH(1)), 3); + PUSH_DATAh(push, screen->tls_size / 2); + PUSH_DATA (push, screen->tls_size / 2); + PUSH_DATA (push, 0xff); + + /* Unified address space ? Who needs that ? Certainly not OpenCL. + * + * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be + * accessible. We cannot prevent that at the moment, so expect failure. 
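(Editor's note on the FATAL warning above, not part of the patch: the bases programmed immediately below are LOCAL_BASE = 1 << 24 = 0x1000000 and SHARED_BASE = 2 << 24 = 0x2000000, so the unusable range [0x1000000, 0x3000000] presumably corresponds to the two 16 MiB l[]/s[] windows carved out of the unified address space; a global buffer placed there would be reached through the local/shared window instead of global memory. In code form, with illustrative macro names that are not from the patch:)

   /* illustration only -- these names do not exist in the driver */
   #define CP_LOCAL_WINDOW_BASE   (1 << 24)   /* 0x01000000, l[] window */
   #define CP_SHARED_WINDOW_BASE  (2 << 24)   /* 0x02000000, s[] window */
   /* excluded range appears to be
    * [CP_LOCAL_WINDOW_BASE, CP_SHARED_WINDOW_BASE + (16 << 20)) = [0x1000000, 0x3000000) */
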
+ */ + BEGIN_NVC0(push, NVE4_COMPUTE(LOCAL_BASE), 1); + PUSH_DATA (push, 1 << 24); + BEGIN_NVC0(push, NVE4_COMPUTE(SHARED_BASE), 1); + PUSH_DATA (push, 2 << 24); + + BEGIN_NVC0(push, NVE4_COMPUTE(CODE_ADDRESS_HIGH), 2); + PUSH_DATAh(push, screen->text->offset); + PUSH_DATA (push, screen->text->offset); + + BEGIN_NVC0(push, SUBC_COMPUTE(0x0310), 1); + PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300); + + /* NOTE: these do not affect the state used by the 3D object */ + BEGIN_NVC0(push, NVE4_COMPUTE(TIC_ADDRESS_HIGH), 3); + PUSH_DATAh(push, screen->txc->offset); + PUSH_DATA (push, screen->txc->offset); + PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1); + BEGIN_NVC0(push, NVE4_COMPUTE(TSC_ADDRESS_HIGH), 3); + PUSH_DATAh(push, screen->txc->offset + 65536); + PUSH_DATA (push, screen->txc->offset + 65536); + PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1); + + if (obj_class >= NVF0_COMPUTE_CLASS) { + BEGIN_NVC0(push, SUBC_COMPUTE(0x0248), 1); + PUSH_DATA (push, 0x100); + BEGIN_NIC0(push, SUBC_COMPUTE(0x0248), 63); + for (i = 63; i >= 1; --i) + PUSH_DATA(push, 0x38000 | i); + IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0); + IMMED_NVC0(push, SUBC_COMPUTE(0x518), 0); + } + + BEGIN_NVC0(push, NVE4_COMPUTE(TEX_CB_INDEX), 1); + PUSH_DATA (push, 0); /* does not interefere with 3D */ + + if (obj_class >= NVF0_COMPUTE_CLASS) + IMMED_NVC0(push, SUBC_COMPUTE(0x02c4), 1); + + /* MS sample coordinate offsets: these do not work with _ALT modes ! */ + BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_ADDRESS_HIGH), 2); + PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS); + PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS); + BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_SIZE), 2); + PUSH_DATA (push, 64); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_UNK0184_UNKVAL); + BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 17); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DATA); + PUSH_DATA (push, 0); /* 0 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 1); /* 1 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); /* 2 */ + PUSH_DATA (push, 1); + PUSH_DATA (push, 1); /* 3 */ + PUSH_DATA (push, 1); + PUSH_DATA (push, 2); /* 4 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 3); /* 5 */ + PUSH_DATA (push, 0); + PUSH_DATA (push, 2); /* 6 */ + PUSH_DATA (push, 1); + PUSH_DATA (push, 3); /* 7 */ + PUSH_DATA (push, 1); + BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1); + PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); + + return 0; +} + + +static void +nve4_compute_validate_surfaces(struct nvc0_context *nvc0) +{ + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nv50_surface *sf; + struct nv04_resource *res; + uint32_t mask; + unsigned i; + const unsigned t = 1; + + mask = nvc0->surfaces_dirty[t]; + while (mask) { + i = ffs(mask) - 1; + mask &= ~(1 << i); + + /* + * NVE4's surface load/store instructions receive all the information + * directly instead of via binding points, so we have to supply them. 
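(Editor's note, not part of the patch, on the comment above about supplying surface information directly:)

   /* the 64-byte UPLOAD_SIZE used just below matches the 16 dwords
    * (info[0]..info[15], push->cur += 16) that nve4_set_surface_info()
    * fills in earlier in this patch; one such descriptor is uploaded per
    * dirty surface slot to screen->parm + NVE4_CP_INPUT_SUF(i). */
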
+ */ + BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_ADDRESS_HIGH), 2); + PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_SUF(i)); + PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_SUF(i)); + BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_SIZE), 2); + PUSH_DATA (push, 64); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_UNK0184_UNKVAL); + BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 17); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DATA); + + nve4_set_surface_info(push, nvc0->surfaces[t][i], screen); + + sf = nv50_surface(nvc0->surfaces[t][i]); + if (sf) { + res = nv04_resource(sf->base.texture); + + if (sf->base.writable) + BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR); + else + BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD); + } + } + if (nvc0->surfaces_dirty[t]) { + BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1); + PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); + } + + /* re-reference non-dirty surfaces */ + mask = nvc0->surfaces_valid[t] & ~nvc0->surfaces_dirty[t]; + while (mask) { + i = ffs(mask) - 1; + mask &= ~(1 << i); + + sf = nv50_surface(nvc0->surfaces[t][i]); + res = nv04_resource(sf->base.texture); + + if (sf->base.writable) + BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR); + else + BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD); + } + + nvc0->surfaces_dirty[t] = 0; +} + + +/* Thankfully, textures with samplers follow the normal rules. */ +static void +nve4_compute_validate_samplers(struct nvc0_context *nvc0) +{ + boolean need_flush = nve4_validate_tsc(nvc0, 5); + if (need_flush) { + BEGIN_NVC0(nvc0->base.pushbuf, NVE4_COMPUTE(TSC_FLUSH), 1); + PUSH_DATA (nvc0->base.pushbuf, 0); + } +} +/* (Code duplicated at bottom for various non-convincing reasons. + * E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC + * entries to avoid a subchannel switch. + * Same for texture cache flushes. + * Also, the bufctx differs, and more IFs in the 3D version looks ugly.) 
+ */ +static void nve4_compute_validate_textures(struct nvc0_context *); + +static void +nve4_compute_set_tex_handles(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + uint64_t address; + const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE); + unsigned i, n; + uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s]; + + if (!dirty) + return; + i = ffs(dirty) - 1; + n = util_logbase2(dirty) + 1 - i; + assert(n); + + address = nvc0->screen->parm->offset + NVE4_CP_INPUT_TEX(i); + + BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_ADDRESS_HIGH), 2); + PUSH_DATAh(push, address); + PUSH_DATA (push, address); + BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_SIZE), 2); + PUSH_DATA (push, n * 4); + PUSH_DATA (push, 0x1); + BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + n); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DATA); + PUSH_DATAp(push, &nvc0->tex_handles[s][i], n); + + BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1); + PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); + + nvc0->textures_dirty[s] = 0; + nvc0->samplers_dirty[s] = 0; +} + + +static boolean +nve4_compute_validate_program(struct nvc0_context *nvc0) +{ + struct nvc0_program *prog = nvc0->compprog; + + if (prog->mem) + return TRUE; + + if (!prog->translated) { + prog->translated = nvc0_program_translate( + prog, nvc0->screen->base.device->chipset); + if (!prog->translated) + return FALSE; + } + if (unlikely(!prog->code_size)) + return FALSE; + + if (likely(prog->code_size)) { + if (nvc0_program_upload_code(nvc0, prog)) { + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1); + PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CODE); + return TRUE; + } + } + return FALSE; +} + + +static boolean +nve4_compute_state_validate(struct nvc0_context *nvc0) +{ + if (!nve4_compute_validate_program(nvc0)) + return FALSE; + if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES) + nve4_compute_validate_textures(nvc0); + if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS) + nve4_compute_validate_samplers(nvc0); + if (nvc0->dirty_cp & (NVC0_NEW_CP_TEXTURES | NVC0_NEW_CP_SAMPLERS)) + nve4_compute_set_tex_handles(nvc0); + if (nvc0->dirty_cp & NVC0_NEW_CP_SURFACES) + nve4_compute_validate_surfaces(nvc0); + if (nvc0->dirty_cp & NVC0_NEW_CP_GLOBALS) + nvc0_validate_global_residents(nvc0, + nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL); + + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE); + + nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp); + if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf))) + return FALSE; + if (unlikely(nvc0->state.flushed)) + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE); + + return TRUE; +} + + +static void +nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input) +{ + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_program *cp = nvc0->compprog; + + if (!cp->parm_size) + return; + + BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_ADDRESS_HIGH), 2); + PUSH_DATAh(push, screen->parm->offset); + PUSH_DATA (push, screen->parm->offset); + BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_SIZE), 2); + PUSH_DATA (push, cp->parm_size); + PUSH_DATA (push, 0x1); + BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + (cp->parm_size / 4)); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DATA); + PUSH_DATAp(push, input, cp->parm_size / 4); + + BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1); + PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); +} + +static INLINE uint8_t +nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size) +{ + if 
(shared_size > (32 << 10))
+      return NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1;
+   if (shared_size > (16 << 10))
+      return NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1;
+   return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1;
+}
+
+static void
+nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
+                               struct nve4_cp_launch_desc *desc,
+                               uint32_t label,
+                               const uint *block_layout,
+                               const uint *grid_layout)
+{
+   const struct nvc0_screen *screen = nvc0->screen;
+   const struct nvc0_program *cp = nvc0->compprog;
+   unsigned i;
+
+   nve4_cp_launch_desc_init_default(desc);
+
+   desc->entry = nvc0_program_symbol_offset(cp, label);
+
+   desc->griddim_x = grid_layout[0];
+   desc->griddim_y = grid_layout[1];
+   desc->griddim_z = grid_layout[2];
+   desc->blockdim_x = block_layout[0];
+   desc->blockdim_y = block_layout[1];
+   desc->blockdim_z = block_layout[2];
+
+   desc->shared_size = align(cp->cp.smem_size, 0x100);
+   desc->local_size_p = align(cp->cp.lmem_size, 0x10);
+   desc->local_size_n = 0;
+   desc->cstack_size = 0x800;
+   desc->cache_split = nve4_compute_derive_cache_split(nvc0, cp->cp.smem_size);
+
+   desc->gpr_alloc = cp->num_gprs;
+   desc->bar_alloc = cp->num_barriers;
+
+   for (i = 0; i < 7; ++i) {
+      const unsigned s = 5;
+      if (nvc0->constbuf[s][i].u.buf)
+         nve4_cp_launch_desc_set_ctx_cb(desc, i + 1, &nvc0->constbuf[s][i]);
+   }
+   nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE);
+}
+
+static INLINE struct nve4_cp_launch_desc *
+nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
+                               struct nouveau_bo **pbo, uint64_t *pgpuaddr)
+{
+   uint8_t *ptr = nouveau_scratch_get(nv, 512, pgpuaddr, pbo);
+   if (!ptr)
+      return NULL;
+   if (*pgpuaddr & 255) {
+      unsigned adj = 256 - (*pgpuaddr & 255);
+      ptr += adj;
+      *pgpuaddr += adj;
+   }
+   return (struct nve4_cp_launch_desc *)ptr;
+}
+
+void
+nve4_launch_grid(struct pipe_context *pipe,
+                 const uint *block_layout, const uint *grid_layout,
+                 uint32_t label,
+                 const void *input)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nve4_cp_launch_desc *desc;
+   uint64_t desc_gpuaddr;
+   struct nouveau_bo *desc_bo;
+   int ret;
+
+   desc = nve4_compute_alloc_launch_desc(&nvc0->base, &desc_bo, &desc_gpuaddr);
+   if (!desc) {
+      ret = -1;
+      goto out;
+   }
+   BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
+                desc_bo);
+
+   ret = !nve4_compute_state_validate(nvc0);
+   if (ret)
+      goto out;
+
+   nve4_compute_setup_launch_desc(nvc0, desc, label, block_layout, grid_layout);
+   nve4_compute_dump_launch_desc(desc);
+
+   nve4_compute_upload_input(nvc0, input);
+
+   /* upload descriptor and flush */
+#if 0
+   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, desc_gpuaddr);
+   PUSH_DATA (push, desc_gpuaddr);
+   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_SIZE), 2);
+   PUSH_DATA (push, 256);
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_UNK0184_UNKVAL);
+   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + (256 / 4));
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DESC);
+   PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
+   BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
+   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
+#endif
+   BEGIN_NVC0(push, NVE4_COMPUTE(LAUNCH_DESC_ADDRESS), 1);
+   PUSH_DATA (push, desc_gpuaddr >> 8);
+   BEGIN_NVC0(push, NVE4_COMPUTE(LAUNCH), 1);
+   PUSH_DATA (push, 0x3);
+   BEGIN_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   PUSH_DATA (push, 0);
+
+out:
+   if (ret)
+      NOUVEAU_ERR("Failed to launch grid!\n");
+   nouveau_scratch_done(&nvc0->base);
+   nouveau_bufctx_reset(nvc0->bufctx_cp,
NVC0_BIND_CP_DESC); +} + + +#define NVE4_TIC_ENTRY_INVALID 0x000fffff + +static void +nve4_compute_validate_textures(struct nvc0_context *nvc0) +{ + struct nouveau_bo *txc = nvc0->screen->txc; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + const unsigned s = 5; + unsigned i; + uint32_t commands[2][NVE4_CP_INPUT_TEX_MAX]; + unsigned n[2] = { 0, 0 }; + + for (i = 0; i < nvc0->num_textures[s]; ++i) { + struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]); + struct nv04_resource *res; + const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i)); + + if (!tic) { + nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID; + continue; + } + res = nv04_resource(tic->pipe.texture); + + if (tic->id < 0) { + tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic); + + PUSH_SPACE(push, 16); + BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_ADDRESS_HIGH), 2); + PUSH_DATAh(push, txc->offset + (tic->id * 32)); + PUSH_DATA (push, txc->offset + (tic->id * 32)); + BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_SIZE), 2); + PUSH_DATA (push, 32); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_UNK0184_UNKVAL); + BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 9); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DATA); + PUSH_DATAp(push, &tic->tic[0], 8); + + commands[0][n[0]++] = (tic->id << 4) | 1; + } else + if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { + commands[1][n[1]++] = (tic->id << 4) | 1; + } + nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32); + + res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; + res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING; + + nvc0->tex_handles[s][i] &= ~NVE4_TIC_ENTRY_INVALID; + nvc0->tex_handles[s][i] |= tic->id; + if (dirty) + BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD); + } + for (; i < nvc0->state.num_textures[s]; ++i) + nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID; + + if (n[0]) { + BEGIN_NIC0(push, NVE4_COMPUTE(TIC_FLUSH), n[0]); + PUSH_DATAp(push, commands[0], n[0]); + } + if (n[1]) { + BEGIN_NIC0(push, NVE4_COMPUTE(TEX_CACHE_CTL), n[1]); + PUSH_DATAp(push, commands[1], n[1]); + } + + nvc0->state.num_textures[s] = nvc0->num_textures[s]; +} + + +static const char *nve4_cache_split_name(unsigned value) +{ + switch (value) { + case NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1: return "16K_SHARED_48K_L1"; + case NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1: return "32K_SHARED_32K_L1"; + case NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1: return "48K_SHARED_16K_L1"; + default: + return "(invalid)"; + } +} + +static void +nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc) +{ + const uint32_t *data = (const uint32_t *)desc; + unsigned i; + boolean zero = FALSE; + + debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n"); + + for (i = 0; i < sizeof(*desc); i += 4) { + if (data[i / 4]) { + debug_printf("[%x]: 0x%08x\n", i, data[i / 4]); + zero = FALSE; + } else + if (!zero) { + debug_printf("...\n"); + zero = TRUE; + } + } + + debug_printf("entry = 0x%x\n", desc->entry); + debug_printf("grid dimensions = %ux%ux%u\n", + desc->griddim_x, desc->griddim_y, desc->griddim_z); + debug_printf("block dimensions = %ux%ux%u\n", + desc->blockdim_x, desc->blockdim_y, desc->blockdim_z); + debug_printf("s[] size: 0x%x\n", desc->shared_size); + debug_printf("l[] size: -0x%x / +0x%x\n", + desc->local_size_n, desc->local_size_p); + debug_printf("stack size: 0x%x\n", desc->cstack_size); + debug_printf("barrier count: %u\n", desc->bar_alloc); + debug_printf("$r count: %u\n", desc->gpr_alloc); + debug_printf("cache split: %s\n", nve4_cache_split_name(desc->cache_split)); + + for (i = 0; i < 8; ++i) { + 
uint64_t address; + uint32_t size = desc->cb[i].size; + boolean valid = !!(desc->cb_mask & (1 << i)); + + address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l; + + if (!valid && !address && !size) + continue; + debug_printf("CB[%u]: address = 0x%"PRIx64", size 0x%x%s\n", + i, address, size, valid ? "" : " (invalid)"); + } +} + diff --git a/src/gallium/drivers/nvc0/nve4_compute.h b/src/gallium/drivers/nvc0/nve4_compute.h new file mode 100644 index 00000000000..82a77480c35 --- /dev/null +++ b/src/gallium/drivers/nvc0/nve4_compute.h @@ -0,0 +1,110 @@ + +#ifndef NVE4_COMPUTE_H +#define NVE4_COMPUTE_H + +#include "nv50/nv50_defs.xml.h" +#include "nve4_compute.xml.h" + +/* Input space is implemented as c0[], to which we bind the screen->parm bo. + */ +#define NVE4_CP_INPUT_USER 0x0000 +#define NVE4_CP_INPUT_USER_LIMIT 0x1000 +#define NVE4_CP_INPUT_TEX(i) (0x1020 + (i) * 4) +#define NVE4_CP_INPUT_TEX_STRIDE 4 +#define NVE4_CP_INPUT_TEX_MAX 32 +#define NVE4_CP_INPUT_MS_OFFSETS 0x10c0 +#define NVE4_CP_INPUT_SUF_STRIDE 64 +#define NVE4_CP_INPUT_SUF(i) (0x1100 + (i) * NVE4_CP_INPUT_SUF_STRIDE) +#define NVE4_CP_INPUT_SUF_MAX 32 +#define NVE4_CP_INPUT_SIZE 0x1900 +#define NVE4_CP_PARAM_SIZE 0x2000 + +struct nve4_cp_launch_desc +{ + u32 unk0[8]; + u32 entry; + u32 unk9[3]; + u32 griddim_x : 31; + u32 unk12 : 1; + u16 griddim_y; + u16 griddim_z; + u32 unk14[3]; + u16 shared_size; /* must be aligned to 0x100 */ + u16 unk15; + u16 unk16; + u16 blockdim_x; + u16 blockdim_y; + u16 blockdim_z; + u32 cb_mask : 8; + u32 unk20_8 : 21; + u32 cache_split : 2; + u32 unk20_31 : 1; + u32 unk21[8]; + struct { + u32 address_l; + u32 address_h : 8; + u32 reserved : 7; + u32 size : 17; + } cb[8]; + u32 local_size_p : 20; + u32 unk45_20 : 7; + u32 bar_alloc : 5; + u32 local_size_n : 20; + u32 unk46_20 : 4; + u32 gpr_alloc : 8; + u32 cstack_size : 20; + u32 unk47_20 : 12; + u32 unk48[16]; +}; + +#define NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DATA 0x41 +#define NVE4_COMPUTE_UPLOAD_EXEC_UNKVAL_DESC 0x11 +#define NVE4_COMPUTE_UPLOAD_UNK0184_UNKVAL 0x1 + +static INLINE void +nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc) +{ + memset(desc, 0, sizeof(*desc)); + + desc->unk0[7] = 0xbc000000; + desc->unk9[2] = 0x44014000; + desc->unk47_20 = 0x300; +} + +static INLINE void +nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc, + unsigned index, + struct nouveau_bo *bo, + uint32_t base, uint16_t size) +{ + uint64_t address = bo->offset + base; + + assert(index < 8); + assert(!(base & 0xff)); + assert(size <= 65536); + + desc->cb[index].address_l = address; + desc->cb[index].address_h = address >> 32; + desc->cb[index].size = size; + + desc->cb_mask |= 1 << index; +} + +static INLINE void +nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc, + unsigned index, + const struct nvc0_constbuf *cb) +{ + assert(index < 8); + + if (!cb->u.buf) { + desc->cb_mask &= ~(1 << index); + } else { + const struct nv04_resource *buf = nv04_resource(cb->u.buf); + assert(!cb->user); + nve4_cp_launch_desc_set_cb(desc, index, + buf->bo, buf->offset + cb->offset, cb->size); + } +} + +#endif /* NVE4_COMPUTE_H */ diff --git a/src/gallium/drivers/nvc0/nve4_compute.xml.h b/src/gallium/drivers/nvc0/nve4_compute.xml.h new file mode 100644 index 00000000000..e513ae7eb86 --- /dev/null +++ b/src/gallium/drivers/nvc0/nve4_compute.xml.h @@ -0,0 +1,269 @@ +#ifndef NVE4_COMPUTE_XML +#define NVE4_COMPUTE_XML + +/* Autogenerated file, DO NOT EDIT manually! 
+ +This file was generated by the rules-ng-ng headergen tool in this git repository: +http://0x04.net/cgit/index.cgi/rules-ng-ng +git clone git://0x04.net/rules-ng-ng + +The rules-ng-ng source files this header was generated from are: +- nve4_compute.xml ( 6352 bytes, from 2013-03-10 14:59:45) +- copyright.xml ( 6452 bytes, from 2011-08-11 18:25:12) +- nvchipsets.xml ( 3870 bytes, from 2013-03-08 12:41:50) +- nv_object.xml ( 13238 bytes, from 2013-02-07 16:35:34) +- nv_defs.xml ( 4437 bytes, from 2011-08-11 18:25:12) +- nv50_defs.xml ( 7783 bytes, from 2013-03-08 12:42:29) + +Copyright (C) 2006-2013 by the following authors: +- Artur Huillet (ahuillet) +- Ben Skeggs (darktama, darktama_) +- B. R. (koala_br) +- Carlos Martin (carlosmn) +- Christoph Bumiller (calim, chrisbmr) +- Dawid Gajownik (gajownik) +- Dmitry Baryshkov +- Dmitry Eremin-Solenikov (lumag) +- EdB (edb_) +- Erik Waling (erikwaling) +- Francisco Jerez (curro) +- imirkin (imirkin) +- jb17bsome (jb17bsome) +- Jeremy Kolb (kjeremy) +- Laurent Carlier (lordheavy) +- Luca Barbieri (lb, lb1) +- Maarten Maathuis (stillunknown) +- Marcin Kościelnicki (mwk, koriakin) +- Mark Carey (careym) +- Matthieu Castet (mat-c) +- nvidiaman (nvidiaman) +- Patrice Mandin (pmandin, pmdata) +- Pekka Paalanen (pq, ppaalanen) +- Peter Popov (ironpeter) +- Richard Hughes (hughsient) +- Rudi Cilibrasi (cilibrar) +- Serge Martin +- Simon Raffeiner +- Stephane Loeuillet (leroutier) +- Stephane Marchesin (marcheu) +- sturmflut (sturmflut) +- Sylvain Munaut +- Victor Stinner (haypo) +- Wladmir van der Laan (miathan6) +- Younes Manton (ymanton) + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice (including the +next paragraph) shall be included in all copies or substantial +portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + + + +#define NVE4_COMPUTE_UPLOAD_SIZE 0x00000180 + +#define NVE4_COMPUTE_UPLOAD_UNK0184 0x00000184 + +#define NVE4_COMPUTE_UPLOAD_ADDRESS_HIGH 0x00000188 + +#define NVE4_COMPUTE_UPLOAD_ADDRESS_LOW 0x0000018c + +#define NVE4_COMPUTE_UNK01A0 0x000001a0 + +#define NVE4_COMPUTE_UNK01A4 0x000001a4 + +#define NVE4_COMPUTE_UNK01A8 0x000001a8 + +#define NVE4_COMPUTE_UNK01AC 0x000001ac + +#define NVE4_COMPUTE_UPLOAD_EXEC 0x000001b0 + +#define NVE4_COMPUTE_UPLOAD_DATA 0x000001b4 + +#define NVE4_COMPUTE_SHARED_BASE 0x00000214 + +#define NVE4_COMPUTE_MEM_BARRIER 0x0000021c + +#define NVE4_COMPUTE_UNK0280 0x00000280 + +#define NVE4_COMPUTE_UNK02B0 0x000002b0 + +#define NVE4_COMPUTE_LAUNCH_DESC_ADDRESS 0x000002b4 +#define NVE4_COMPUTE_LAUNCH_DESC_ADDRESS__SHR 8 + +#define NVE4_COMPUTE_UNK02B8 0x000002b8 + +#define NVE4_COMPUTE_LAUNCH 0x000002bc + +#define NVE4_COMPUTE_TEMP_SIZE(i0) (0x000002e4 + 0xc*(i0)) +#define NVE4_COMPUTE_TEMP_SIZE__ESIZE 0x0000000c +#define NVE4_COMPUTE_TEMP_SIZE__LEN 0x00000002 + +#define NVE4_COMPUTE_TEMP_SIZE_HIGH(i0) (0x000002e4 + 0xc*(i0)) + +#define NVE4_COMPUTE_TEMP_SIZE_LOW(i0) (0x000002e8 + 0xc*(i0)) + +#define NVE4_COMPUTE_TEMP_SIZE_MASK(i0) (0x000002ec + 0xc*(i0)) + +#define NVE4_COMPUTE_UNK0310 0x00000310 + +#define NVE4_COMPUTE_LOCAL_BASE 0x0000077c + +#define NVE4_COMPUTE_TEMP_ADDRESS_HIGH 0x00000790 + +#define NVE4_COMPUTE_TEMP_ADDRESS_LOW 0x00000794 + +#define NVE4_COMPUTE_WATCHDOG_TIMER 0x00000de4 + +#define NVE4_COMPUTE_LINKED_TSC 0x00001234 + +#define NVE4_COMPUTE_TSC_FLUSH 0x00001330 +#define NVE4_COMPUTE_TSC_FLUSH_SPECIFIC 0x00000001 +#define NVE4_COMPUTE_TSC_FLUSH_ENTRY__MASK 0x03fffff0 +#define NVE4_COMPUTE_TSC_FLUSH_ENTRY__SHIFT 4 + +#define NVE4_COMPUTE_TIC_FLUSH 0x00001334 +#define NVE4_COMPUTE_TIC_FLUSH_SPECIFIC 0x00000001 +#define NVE4_COMPUTE_TIC_FLUSH_ENTRY__MASK 0x03fffff0 +#define NVE4_COMPUTE_TIC_FLUSH_ENTRY__SHIFT 4 + +#define NVE4_COMPUTE_TEX_CACHE_CTL 0x00001338 +#define NVE4_COMPUTE_TEX_CACHE_CTL_UNK0__MASK 0x00000007 +#define NVE4_COMPUTE_TEX_CACHE_CTL_UNK0__SHIFT 0 +#define NVE4_COMPUTE_TEX_CACHE_CTL_ENTRY__MASK 0x03fffff0 +#define NVE4_COMPUTE_TEX_CACHE_CTL_ENTRY__SHIFT 4 + +#define NVE4_COMPUTE_COND_ADDRESS_HIGH 0x00001550 + +#define NVE4_COMPUTE_COND_ADDRESS_LOW 0x00001554 + +#define NVE4_COMPUTE_COND_MODE 0x00001558 +#define NVE4_COMPUTE_COND_MODE_NEVER 0x00000000 +#define NVE4_COMPUTE_COND_MODE_ALWAYS 0x00000001 +#define NVE4_COMPUTE_COND_MODE_RES_NON_ZERO 0x00000002 +#define NVE4_COMPUTE_COND_MODE_EQUAL 0x00000003 +#define NVE4_COMPUTE_COND_MODE_NOT_EQUAL 0x00000004 + +#define NVE4_COMPUTE_TSC_ADDRESS_HIGH 0x0000155c + +#define NVE4_COMPUTE_TSC_ADDRESS_LOW 0x00001560 + +#define NVE4_COMPUTE_TSC_LIMIT 0x00001564 + +#define NVE4_COMPUTE_TIC_ADDRESS_HIGH 0x00001574 + +#define NVE4_COMPUTE_TIC_ADDRESS_LOW 0x00001578 + +#define NVE4_COMPUTE_TIC_LIMIT 0x0000157c + +#define NVE4_COMPUTE_CODE_ADDRESS_HIGH 0x00001608 + +#define NVE4_COMPUTE_CODE_ADDRESS_LOW 0x0000160c + +#define NVE4_COMPUTE_FLUSH 0x00001698 +#define NVE4_COMPUTE_FLUSH_CODE 0x00000001 +#define NVE4_COMPUTE_FLUSH_GLOBAL 0x00000010 +#define NVE4_COMPUTE_FLUSH_UNK8 0x00000100 +#define NVE4_COMPUTE_FLUSH_CB 0x00001000 + +#define NVE4_COMPUTE_QUERY_ADDRESS_HIGH 0x00001b00 + +#define NVE4_COMPUTE_QUERY_ADDRESS_LOW 0x00001b04 + +#define NVE4_COMPUTE_QUERY_SEQUENCE 0x00001b08 + +#define NVE4_COMPUTE_QUERY_GET 0x00001b0c +#define NVE4_COMPUTE_QUERY_GET_MODE__MASK 0x00000003 +#define NVE4_COMPUTE_QUERY_GET_MODE__SHIFT 0 +#define NVE4_COMPUTE_QUERY_GET_MODE_WRITE 0x00000000 +#define 
NVE4_COMPUTE_QUERY_GET_MODE_WRITE_INTR_NRHOST 0x00000003 +#define NVE4_COMPUTE_QUERY_GET_INTR 0x00100000 +#define NVE4_COMPUTE_QUERY_GET_SHORT 0x10000000 + +#define NVE4_COMPUTE_TEX_CB_INDEX 0x00002608 + +#define NVE4_COMPUTE_UNK260c 0x0000260c + +#define NVE4_COMPUTE_LAUNCH_DESC__SIZE 0x00000100 +#define NVE4_COMPUTE_LAUNCH_DESC_PROG_START 0x00000020 + +#define NVE4_COMPUTE_LAUNCH_DESC_12 0x00000030 +#define NVE4_COMPUTE_LAUNCH_DESC_12_GRIDDIM_X__MASK 0x7fffffff +#define NVE4_COMPUTE_LAUNCH_DESC_12_GRIDDIM_X__SHIFT 0 + +#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ 0x00000034 +#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ_Y__MASK 0x0000ffff +#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ_Y__SHIFT 0 +#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ_Z__MASK 0xffff0000 +#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ_Z__SHIFT 16 + +#define NVE4_COMPUTE_LAUNCH_DESC_17 0x00000044 +#define NVE4_COMPUTE_LAUNCH_DESC_17_SHARED_ALLOC__MASK 0x0000ffff +#define NVE4_COMPUTE_LAUNCH_DESC_17_SHARED_ALLOC__SHIFT 0 + +#define NVE4_COMPUTE_LAUNCH_DESC_18 0x00000048 +#define NVE4_COMPUTE_LAUNCH_DESC_18_BLOCKDIM_X__MASK 0xffff0000 +#define NVE4_COMPUTE_LAUNCH_DESC_18_BLOCKDIM_X__SHIFT 16 + +#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ 0x0000004c +#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ_Y__MASK 0x0000ffff +#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ_Y__SHIFT 0 +#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ_Z__MASK 0xffff0000 +#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ_Z__SHIFT 16 + +#define NVE4_COMPUTE_LAUNCH_DESC_20 0x00000050 +#define NVE4_COMPUTE_LAUNCH_DESC_20_CB_VALID__MASK 0x000000ff +#define NVE4_COMPUTE_LAUNCH_DESC_20_CB_VALID__SHIFT 0 +#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT__MASK 0x60000000 +#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT__SHIFT 29 +#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT_16K_SHARED_48K_L1 0x20000000 +#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT_32K_SHARED_32K_L1 0x40000000 +#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT_48K_SHARED_16K_L1 0x60000000 + +#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0(i0) (0x00000074 + 0x8*(i0)) +#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0__ESIZE 0x00000008 +#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0__LEN 0x00000008 +#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0_ADDRESS_LOW__MASK 0xffffffff +#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0_ADDRESS_LOW__SHIFT 0 + +#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1(i0) (0x00000078 + 0x8*(i0)) +#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1__ESIZE 0x00000008 +#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1__LEN 0x00000008 +#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1_ADDRESS_HIGH__MASK 0x000000ff +#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1_ADDRESS_HIGH__SHIFT 0 +#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1_SIZE__MASK 0xffff8000 +#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1_SIZE__SHIFT 15 + +#define NVE4_COMPUTE_LAUNCH_DESC_45 0x000000b4 +#define NVE4_COMPUTE_LAUNCH_DESC_45_LOCAL_POS_ALLOC__MASK 0x000fffff +#define NVE4_COMPUTE_LAUNCH_DESC_45_LOCAL_POS_ALLOC__SHIFT 0 +#define NVE4_COMPUTE_LAUNCH_DESC_45_BARRIER_ALLOC__MASK 0xf8000000 +#define NVE4_COMPUTE_LAUNCH_DESC_45_BARRIER_ALLOC__SHIFT 27 + +#define NVE4_COMPUTE_LAUNCH_DESC_46 0x000000b8 +#define NVE4_COMPUTE_LAUNCH_DESC_46_LOCAL_NEG_ALLOC__MASK 0x000fffff +#define NVE4_COMPUTE_LAUNCH_DESC_46_LOCAL_NEG_ALLOC__SHIFT 0 +#define NVE4_COMPUTE_LAUNCH_DESC_46_GPR_ALLOC__MASK 0x3f000000 +#define NVE4_COMPUTE_LAUNCH_DESC_46_GPR_ALLOC__SHIFT 24 + +#define NVE4_COMPUTE_LAUNCH_DESC_47 0x000000bc +#define NVE4_COMPUTE_LAUNCH_DESC_47_WARP_CSTACK_SIZE__MASK 
0x000fffff +#define NVE4_COMPUTE_LAUNCH_DESC_47_WARP_CSTACK_SIZE__SHIFT 0 + + +#endif /* NVE4_COMPUTE_XML */
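
As an illustration (not part of the files added above): the launch descriptor is fetched directly by the compute engine, so the C struct in nve4_compute.h has to line up exactly with the NVE4_COMPUTE_LAUNCH_DESC_* byte offsets in nve4_compute.xml.h. The sketch below is a minimal example, assuming the usual GCC bitfield packing on a little-endian host and using placeholder values for the entry offset, GPR/barrier counts and parameter buffer; it adds a few compile-time layout checks and fills a descriptor for a hypothetical 64x64x1 grid of 16x16x1 blocks with the helpers introduced in this patch.

/* Hypothetical layout-check / usage sketch, not part of this commit. */
#include <stddef.h>

#include "nvc0_context.h"
#include "nve4_compute.h"

/* The struct is expected to match the LAUNCH_DESC_* byte offsets
 * (this assumes the typical GCC layout for the u16/bitfield members).
 */
#define DESC_LAYOUT_CHECK(name, expr) \
   typedef char desc_layout_check_##name[(expr) ? 1 : -1]

DESC_LAYOUT_CHECK(size, sizeof(struct nve4_cp_launch_desc) ==
                  NVE4_COMPUTE_LAUNCH_DESC__SIZE);
DESC_LAYOUT_CHECK(entry, offsetof(struct nve4_cp_launch_desc, entry) ==
                  NVE4_COMPUTE_LAUNCH_DESC_PROG_START);
DESC_LAYOUT_CHECK(griddim_yz, offsetof(struct nve4_cp_launch_desc, griddim_y) ==
                  NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ);
DESC_LAYOUT_CHECK(blockdim_yz, offsetof(struct nve4_cp_launch_desc, blockdim_y) ==
                  NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ);
DESC_LAYOUT_CHECK(cb0, offsetof(struct nve4_cp_launch_desc, cb) ==
                  NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0(0));

/* Fill a descriptor for a hypothetical kernel: a 64x64x1 grid of 16x16x1
 * blocks, no s[] or l[] usage, parameters bound as c0[] from parm_bo.
 * Entry offset and GPR/barrier counts are placeholders; in the driver they
 * come from the compiled nvc0_program (see nve4_compute_setup_launch_desc).
 */
static void
example_fill_launch_desc(struct nve4_cp_launch_desc *desc,
                         struct nouveau_bo *parm_bo)
{
   nve4_cp_launch_desc_init_default(desc);

   desc->entry = 0x0;                 /* code offset of the kernel */

   desc->griddim_x = 64;
   desc->griddim_y = 64;
   desc->griddim_z = 1;
   desc->blockdim_x = 16;
   desc->blockdim_y = 16;
   desc->blockdim_z = 1;

   desc->shared_size = 0;             /* s[] bytes, align to 0x100 if used */
   desc->local_size_p = 0;            /* l[] bytes, align to 0x10 if used */
   desc->cstack_size = 0x800;         /* same default the driver picks */
   desc->cache_split =
      NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT_16K_SHARED_48K_L1 >>
      NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT__SHIFT;

   desc->gpr_alloc = 32;              /* placeholder: from the program */
   desc->bar_alloc = 0;

   /* bind the parameter buffer as c0[], like nve4_launch_grid does */
   nve4_cp_launch_desc_set_cb(desc, 0, parm_bo, 0, NVE4_CP_INPUT_SIZE);
}

A check along these lines is cheap insurance: the descriptor is consumed by the hardware without any validation by the kernel driver, so a compiler that packs the bitfields differently would produce a silently broken launch rather than a build failure.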