From: Adam Rak Date: Wed, 30 Nov 2011 21:20:41 +0000 (+0100) Subject: r600g: compute support for evergreen X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=6a829a1b724ca0d960decee217d260b4de8a5463;p=mesa.git r600g: compute support for evergreen Tom Stellard: - Updated for gallium interface changes - Fixed a few bugs: + Set the loop counter + Calculate the correct number of pipes - Added hooks into the LLVM compiler --- diff --git a/configure.ac b/configure.ac index db68a87f007..527accca522 100644 --- a/configure.ac +++ b/configure.ac @@ -1993,13 +1993,18 @@ if test "x$with_gallium_drivers" != x; then PKG_CHECK_MODULES([RADEON], [libdrm_radeon >= $LIBDRM_RADEON_REQUIRED]) gallium_require_drm_loader GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS r600" - if test "x$enable_r600_llvm" = xyes; then + if test "x$enable_r600_llvm" = xyes -o "x$enable_opencl" = xyes; then if test "x$LLVM_VERSION" != "x3.1"; then AC_MSG_ERROR([LLVM 3.1 is required for the r600 llvm compiler.]) fi NEED_RADEON_GALLIUM=yes; + fi + if test "x$enable_r600_llvm" = xyes; then USE_R600_LLVM_COMPILER=yes; fi + if test "x$enable_opencl" = xyes -a "x$with_llvm_shared_libs" = xno; then + LLVM_LIBS="${LLVM_LIBS} `llvm-config --libs bitreader asmparser`" + fi gallium_check_st "radeon/drm" "dri-r600" "xorg-r600" "" "xvmc-r600" "vdpau-r600" "va-r600" ;; xradeonsi) diff --git a/src/gallium/drivers/r600/Makefile.am b/src/gallium/drivers/r600/Makefile.am index 77d2674d262..31d885a3416 100644 --- a/src/gallium/drivers/r600/Makefile.am +++ b/src/gallium/drivers/r600/Makefile.am @@ -18,7 +18,7 @@ AM_CFLAGS = \ libr600_a_SOURCES = \ $(C_SOURCES) -if USE_R600_LLVM_COMPILER +if NEED_RADEON_GALLIUM # This is a hack until we can move the backend into the LLVM project. 
# We need to use mklib, because it splits up libradeon.a into object files @@ -26,18 +26,28 @@ if USE_R600_LLVM_COMPILER libr600_a_AR = $(top_srcdir)/bin/mklib -o r600 -static libr600_a_SOURCES += \ - $(LLVM_C_SOURCES) + $(LLVM_C_SOURCES) \ + $(LLVM_CXX_SOURCES) libr600_a_LIBADD = \ $(top_builddir)/src/gallium/drivers/radeon/libradeon.a AM_CFLAGS += \ $(LLVM_CFLAGS) \ - -I$(top_srcdir)/src/gallium/drivers/radeon/ \ - -DR600_USE_LLVM + -I$(top_srcdir)/src/gallium/drivers/radeon/ AM_CXXFLAGS= \ $(LLVM_CXXFLAGS) else libr600_a_AR = $(AR) $(ARFLAGS) endif + +if USE_R600_LLVM_COMPILER +AM_CFLAGS += \ + -DR600_USE_LLVM +endif + +if HAVE_GALLIUM_COMPUTE +AM_CFLAGS += \ + -DHAVE_OPENCL +endif diff --git a/src/gallium/drivers/r600/Makefile.sources b/src/gallium/drivers/r600/Makefile.sources index b7b0d50b637..50546e6fb2f 100644 --- a/src/gallium/drivers/r600/Makefile.sources +++ b/src/gallium/drivers/r600/Makefile.sources @@ -14,6 +14,10 @@ C_SOURCES = \ evergreen_state.c \ eg_asm.c \ r600_translate.c \ - r600_state_common.c + r600_state_common.c \ + evergreen_compute.c \ + evergreen_compute_internal.c \ + compute_memory_pool.c LLVM_C_SOURCES = r600_llvm.c +LLVM_CXX_SOURCES = llvm_wrapper.cpp diff --git a/src/gallium/drivers/r600/compute_memory_pool.c b/src/gallium/drivers/r600/compute_memory_pool.c new file mode 100644 index 00000000000..01bf0c33dfd --- /dev/null +++ b/src/gallium/drivers/r600/compute_memory_pool.c @@ -0,0 +1,397 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * 
paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Adam Rak + */ + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "util/u_blitter.h" +#include "util/u_double_list.h" +#include "util/u_transfer.h" +#include "util/u_surface.h" +#include "util/u_pack_color.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "util/u_framebuffer.h" +#include "r600.h" +#include "r600_resource.h" +#include "r600_shader.h" +#include "r600_pipe.h" +#include "r600_formats.h" +#include "compute_memory_pool.h" +#include "evergreen_compute_internal.h" + +/** + * Creates a new pool + */ +struct compute_memory_pool* compute_memory_pool_new( + int64_t initial_size_in_dw, + struct r600_screen * rscreen) +{ + struct compute_memory_pool* pool = (struct compute_memory_pool*) + CALLOC(sizeof(struct compute_memory_pool), 1); + + pool->next_id = 1; + pool->size_in_dw = initial_size_in_dw; + pool->screen = rscreen; + pool->bo = (struct r600_resource*)r600_compute_buffer_alloc_vram( + pool->screen, pool->size_in_dw*4); + pool->shadow = (uint32_t*)CALLOC(4, pool->size_in_dw); + + return pool; +} + +/** + * Frees all stuff in the pool and the pool struct itself too + */ +void compute_memory_pool_delete(struct compute_memory_pool* pool) +{ + free(pool->shadow); + pool->screen->screen.resource_destroy((struct pipe_screen *) + pool->screen, (struct pipe_resource *)pool->bo); + free(pool); +} + +/** + 
* Searches for an empty space in the pool, return with the pointer to the + * allocatable space in the pool, returns -1 on failure. + */ +int64_t compute_memory_prealloc_chunk( + struct compute_memory_pool* pool, + int64_t size_in_dw) +{ + assert(size_in_dw <= pool->size_in_dw); + + struct compute_memory_item *item; + + int last_end = 0; + + for (item = pool->item_list; item; item = item->next) { + if (item->start_in_dw > -1) { + if (item->start_in_dw-last_end > size_in_dw) { + return last_end; + } + + last_end = item->start_in_dw + item->size_in_dw; + last_end += (1024 - last_end % 1024); + } + } + + if (pool->size_in_dw - last_end < size_in_dw) { + return -1; + } + + return last_end; +} + +/** + * Search for the chunk where we can link our new chunk after it. + */ +struct compute_memory_item* compute_memory_postalloc_chunk( + struct compute_memory_pool* pool, + int64_t start_in_dw) +{ + struct compute_memory_item* item; + + for (item = pool->item_list; item; item = item->next) { + if (item->next) { + if (item->start_in_dw < start_in_dw + && item->next->start_in_dw > start_in_dw) { + return item; + } + } + else { + /* end of chain */ + assert(item->start_in_dw < start_in_dw); + return item; + } + } + + assert(0 && "unreachable"); + return NULL; +} + +/** + * Reallocates pool, conserves data + */ +void compute_memory_grow_pool(struct compute_memory_pool* pool, + struct pipe_context * pipe, int new_size_in_dw) +{ + assert(new_size_in_dw >= pool->size_in_dw); + + new_size_in_dw += 1024 - (new_size_in_dw % 1024); + + compute_memory_shadow(pool, pipe, 1); + pool->shadow = (uint32_t*)realloc(pool->shadow, new_size_in_dw*4); + pool->size_in_dw = new_size_in_dw; + pool->screen->screen.resource_destroy( + (struct pipe_screen *)pool->screen, + (struct pipe_resource *)pool->bo); + pool->bo = r600_compute_buffer_alloc_vram(pool->screen, + pool->size_in_dw*4); + compute_memory_shadow(pool, pipe, 0); +} + +/** + * Copy pool from device to host, or host to device. 
+ */ +void compute_memory_shadow(struct compute_memory_pool* pool, + struct pipe_context * pipe, int device_to_host) +{ + struct compute_memory_item chunk; + + chunk.id = 0; + chunk.start_in_dw = 0; + chunk.size_in_dw = pool->size_in_dw; + chunk.prev = chunk.next = NULL; + compute_memory_transfer(pool, pipe, device_to_host, &chunk, + pool->shadow, 0, pool->size_in_dw*4); +} + +/** + * Allocates pending allocations in the pool + */ +void compute_memory_finalize_pending(struct compute_memory_pool* pool, + struct pipe_context * pipe) +{ + struct compute_memory_item *pending_list = NULL, *end_p = NULL; + struct compute_memory_item *item, *next; + + int64_t allocated = 0; + int64_t unallocated = 0; + + for (item = pool->item_list; item; item = item->next) { + COMPUTE_DBG("list: %i %p\n", item->start_in_dw, item->next); + } + + for (item = pool->item_list; item; item = next) { + next = item->next; + + + if (item->start_in_dw == -1) { + if (end_p) { + end_p->next = item; + } + else { + pending_list = item; + } + + if (item->prev) { + item->prev->next = next; + } + else { + pool->item_list = next; + } + + if (next) { + next->prev = item->prev; + } + + item->prev = end_p; + item->next = NULL; + end_p = item; + + unallocated += item->size_in_dw+1024; + } + else { + allocated += item->size_in_dw; + } + } + + if (pool->size_in_dw < allocated+unallocated) { + compute_memory_grow_pool(pool, pipe, allocated+unallocated); + } + + for (item = pending_list; item; item = next) { + next = item->next; + + int64_t start_in_dw; + + while ((start_in_dw=compute_memory_prealloc_chunk(pool, + item->size_in_dw)) == -1) { + int64_t need = item->size_in_dw+2048 - + (pool->size_in_dw - allocated); + + need += 1024 - (need % 1024); + + if (need > 0) { + compute_memory_grow_pool(pool, + pipe, + pool->size_in_dw + need); + } + else { + need = pool->size_in_dw / 10; + need += 1024 - (need % 1024); + compute_memory_grow_pool(pool, + pipe, + pool->size_in_dw + need); + } + } + + item->start_in_dw = 
start_in_dw; + item->next = NULL; + item->prev = NULL; + + if (pool->item_list) { + struct compute_memory_item *pos; + + pos = compute_memory_postalloc_chunk(pool, start_in_dw); + item->prev = pos; + item->next = pos->next; + pos->next = item; + + if (item->next) { + item->next->prev = item; + } + } + else { + pool->item_list = item; + } + + allocated += item->size_in_dw; + } +} + + +void compute_memory_free(struct compute_memory_pool* pool, int64_t id) +{ + struct compute_memory_item *item, *next; + + for (item = pool->item_list; item; item = next) { + next = item->next; + + if (item->id == id) { + if (item->prev) { + item->prev->next = item->next; + } + else { + pool->item_list = item->next; + } + + if (item->next) { + item->next->prev = item->prev; + } + + free(item); + + return; + } + } + + fprintf(stderr, "Internal error, invalid id %ld " + "for compute_memory_free\n", id); + + assert(0 && "error"); +} + +/** + * Creates pending allocations + */ +struct compute_memory_item* compute_memory_alloc( + struct compute_memory_pool* pool, + int64_t size_in_dw) +{ + struct compute_memory_item *new_item; + + COMPUTE_DBG("Alloc: %i\n", size_in_dw); + + new_item = (struct compute_memory_item *) + CALLOC(sizeof(struct compute_memory_item), 1); + new_item->size_in_dw = size_in_dw; + new_item->start_in_dw = -1; /* mark pending */ + new_item->id = pool->next_id++; + new_item->pool = pool; + + struct compute_memory_item *last_item; + + if (pool->item_list) { + for (last_item = pool->item_list; last_item->next; + last_item = last_item->next); + + last_item->next = new_item; + new_item->prev = last_item; + } + else { + pool->item_list = new_item; + } + + return new_item; +} + +/** + * Transfer data host<->device, offset and size is in bytes + */ +void compute_memory_transfer( + struct compute_memory_pool* pool, + struct pipe_context * pipe, + int device_to_host, + struct compute_memory_item* chunk, + void* data, + int offset_in_chunk, + int size) +{ + int64_t aligned_size = 
pool->size_in_dw; + struct pipe_resource* gart = (struct pipe_resource*)pool->bo; + int64_t internal_offset = chunk->start_in_dw*4 + offset_in_chunk; + + struct pipe_transfer *xfer; + uint32_t *map; + + if (device_to_host) + { + xfer = pipe->get_transfer(pipe, gart, 0, PIPE_TRANSFER_READ, + &(struct pipe_box) { .width = aligned_size, + .height = 1, .depth = 1 }); + assert(xfer); + map = pipe->transfer_map(pipe, xfer); + assert(map); + memcpy(data, map + internal_offset, size); + pipe->transfer_unmap(pipe, xfer); + pipe->transfer_destroy(pipe, xfer); + } else { + xfer = pipe->get_transfer(pipe, gart, 0, PIPE_TRANSFER_WRITE, + &(struct pipe_box) { .width = aligned_size, + .height = 1, .depth = 1 }); + assert(xfer); + map = pipe->transfer_map(pipe, xfer); + assert(map); + memcpy(map + internal_offset, data, size); + pipe->transfer_unmap(pipe, xfer); + pipe->transfer_destroy(pipe, xfer); + } +} + +/** + * Transfer data between chunk<->data, it is for VRAM<->GART transfers + */ +void compute_memory_transfer_direct( + struct compute_memory_pool* pool, + int chunk_to_data, + struct compute_memory_item* chunk, + struct r600_resource* data, + int offset_in_chunk, + int offset_in_data, + int size) +{ + ///TODO: DMA +} diff --git a/src/gallium/drivers/r600/compute_memory_pool.h b/src/gallium/drivers/r600/compute_memory_pool.h new file mode 100644 index 00000000000..a14eba1df7e --- /dev/null +++ b/src/gallium/drivers/r600/compute_memory_pool.h @@ -0,0 +1,98 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice 
(including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Adam Rak + */ + +#ifndef COMPUTE_MEMORY_POOL +#define COMPUTE_MEMORY_POOL + +#include + +struct compute_memory_pool; + +struct compute_memory_item +{ + int64_t id; ///ID of the memory chunk + + int untouched; ///True if the memory contains only junk, no need to save it for defrag + + int64_t start_in_dw; ///Start pointer in dwords relative in the pool bo + int64_t size_in_dw; ///Size of the chunk in dwords + + struct compute_memory_pool* pool; + + struct compute_memory_item* prev; + struct compute_memory_item* next; +}; + +struct compute_memory_pool +{ + int64_t next_id; ///For generating unique IDs for memory chunks + int64_t size_in_dw; ///Size of the pool in dwords + + struct r600_resource *bo; ///The pool buffer object resource + struct compute_memory_item* item_list; ///Allocated memory chunks in the buffer,they must be ordered by "start_in_dw" + struct r600_screen *screen; + + uint32_t *shadow; ///host copy of the pool, used for defragmentation +}; + + +struct compute_memory_pool* compute_memory_pool_new(int64_t initial_size_in_dw, struct r600_screen *rscreen); ///Creates a new pool +void compute_memory_pool_delete(struct compute_memory_pool* pool); ///Frees all stuff in the pool and the pool struct itself too + +int64_t compute_memory_prealloc_chunk(struct compute_memory_pool* pool, int64_t size_in_dw); ///searches for an empty space in 
the pool, return with the pointer to the allocatable space in the pool, returns -1 on failure + +struct compute_memory_item* compute_memory_postalloc_chunk(struct compute_memory_pool* pool, int64_t start_in_dw); ///search for the chunk where we can link our new chunk after it + +/** + * reallocates pool, conserves data + */ +void compute_memory_grow_pool(struct compute_memory_pool* pool, struct pipe_context * pipe, + int new_size_in_dw); + +/** + * Copy pool from device to host, or host to device + */ +void compute_memory_shadow(struct compute_memory_pool* pool, + struct pipe_context * pipe, int device_to_host); + +/** + * Allocates pending allocations in the pool + */ +void compute_memory_finalize_pending(struct compute_memory_pool* pool, + struct pipe_context * pipe); +void compute_memory_defrag(struct compute_memory_pool* pool); ///Defragment the memory pool, always heavy memory usage +void compute_memory_free(struct compute_memory_pool* pool, int64_t id); +struct compute_memory_item* compute_memory_alloc(struct compute_memory_pool* pool, int64_t size_in_dw); ///Creates pending allocations + +/** + * Transfer data host<->device, offset and size is in bytes + */ +void compute_memory_transfer(struct compute_memory_pool* pool, + struct pipe_context * pipe, int device_to_host, + struct compute_memory_item* chunk, void* data, + int offset_in_chunk, int size); + +void compute_memory_transfer_direct(struct compute_memory_pool* pool, int chunk_to_data, struct compute_memory_item* chunk, struct r600_resource* data, int offset_in_chunk, int offset_in_data, int size); ///Transfer data between chunk<->data, it is for VRAM<->GART transfers + +#endif diff --git a/src/gallium/drivers/r600/compute_resource.def b/src/gallium/drivers/r600/compute_resource.def new file mode 100644 index 00000000000..161f5062ff7 --- /dev/null +++ b/src/gallium/drivers/r600/compute_resource.def @@ -0,0 +1,38 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a + * copy 
of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Adam Rak + */ + + +DECL_COMPUTE_RESOURCE(CONFIG, 1) +DECL_COMPUTE_RESOURCE(CONST_MEM, 16) +DECL_COMPUTE_RESOURCE(RAT, 12) +DECL_COMPUTE_RESOURCE(VERT, 16) +DECL_COMPUTE_RESOURCE(TEX, 16) +DECL_COMPUTE_RESOURCE(SAMPLER, 18) +DECL_COMPUTE_RESOURCE(LOOP, 32) +DECL_COMPUTE_RESOURCE(LDS, 1) +DECL_COMPUTE_RESOURCE(GDS, 1) +DECL_COMPUTE_RESOURCE(EXPORT, 1) +DECL_COMPUTE_RESOURCE(SHADER, 1) +DECL_COMPUTE_RESOURCE(TMPRING, 4) +DECL_COMPUTE_RESOURCE(DISPATCH, 1) diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c new file mode 100644 index 00000000000..7aeb4038ae1 --- /dev/null +++ b/src/gallium/drivers/r600/evergreen_compute.c @@ -0,0 +1,814 @@ +/* + * Copyright 2011 Adam Rak + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Adam Rak + */ + +#include +#include +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "util/u_blitter.h" +#include "util/u_double_list.h" +#include "util/u_transfer.h" +#include "util/u_surface.h" +#include "util/u_pack_color.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "util/u_framebuffer.h" +#include "pipebuffer/pb_buffer.h" +#include "r600.h" +#include "evergreend.h" +#include "r600_resource.h" +#include "r600_shader.h" +#include "r600_pipe.h" +#include "r600_formats.h" +#include "evergreen_compute.h" +#include "r600_hw_context_priv.h" +#include "evergreen_compute_internal.h" +#include "compute_memory_pool.h" +#ifdef HAVE_OPENCL +#include "llvm_wrapper.h" +#endif + +/** +RAT0 is for global binding write +VTX1 is for global binding read + +for wrting images RAT1... +for reading images TEX2... + TEX2-RAT1 is paired + +TEX2... consumes the same fetch resources, that VTX2... would consume + +CONST0 and VTX0 is for parameters + CONST0 is binding smaller input parameter buffer, and for constant indexing, + also constant cached + VTX0 is for indirect/non-constant indexing, or if the input is bigger than + the constant cache can handle + +RAT-s are limited to 12, so we can only bind at most 11 texture for writing +because we reserve RAT0 for global bindings. With byteaddressing enabled, +we should reserve another one too.=> 10 image binding for writing max. + +from Nvidia OpenCL: + CL_DEVICE_MAX_READ_IMAGE_ARGS: 128 + CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8 + +so 10 for writing is enough. 
176 is the max for reading according to the docs + +writable images should be listed first < 10, so their id corresponds to RAT(id+1) +writable images will consume TEX slots, VTX slots too because of linear indexing + +*/ + +const struct u_resource_vtbl r600_global_buffer_vtbl = +{ + u_default_resource_get_handle, /* get_handle */ + r600_compute_global_buffer_destroy, /* resource_destroy */ + r600_compute_global_get_transfer, /* get_transfer */ + r600_compute_global_transfer_destroy, /* transfer_destroy */ + r600_compute_global_transfer_map, /* transfer_map */ + r600_compute_global_transfer_flush_region,/* transfer_flush_region */ + r600_compute_global_transfer_unmap, /* transfer_unmap */ + r600_compute_global_transfer_inline_write /* transfer_inline_write */ +}; + + +void *evergreen_create_compute_state( + struct pipe_context *ctx_, + const const struct pipe_compute_state *cso) +{ + struct r600_context *ctx = (struct r600_context *)ctx_; + +#ifdef HAVE_OPENCL + const struct pipe_llvm_program_header * header; + const unsigned char * code; + + header = cso->prog; + code = cso->prog + sizeof(struct pipe_llvm_program_header); +#endif + + if (!ctx->screen->screen.get_param(&ctx->screen->screen, + PIPE_CAP_COMPUTE)) { + fprintf(stderr, "Compute is not supported\n"); + return NULL; + } + struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute); + + shader->ctx = (struct r600_context*)ctx; + shader->resources = (struct evergreen_compute_resource*) + CALLOC(sizeof(struct evergreen_compute_resource), + get_compute_resource_num()); + shader->local_size = cso->req_local_mem; ///TODO: assert it + shader->private_size = cso->req_private_mem; + shader->input_size = cso->req_input_mem; + +#ifdef HAVE_OPENCL + shader->mod = llvm_parse_bitcode(code, header->num_bytes); + + r600_compute_shader_create(ctx_, shader->mod, &shader->bc); +#endif + return shader; +} + +void evergreen_delete_compute_state(struct pipe_context *ctx, void* state) +{ + struct r600_pipe_compute 
*shader = (struct r600_pipe_compute *)state; + + free(shader->resources); + free(shader); +} + +static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state) +{ + struct r600_context *ctx = (struct r600_context *)ctx_; + + ctx->cs_shader = (struct r600_pipe_compute *)state; + + assert(!ctx->cs_shader->shader_code_bo); + + ctx->cs_shader->shader_code_bo = + r600_compute_buffer_alloc_vram(ctx->screen, + ctx->cs_shader->bc.ndw * 4); + + void *p = ctx->ws->buffer_map(ctx->cs_shader->shader_code_bo->cs_buf, + ctx->cs, PIPE_TRANSFER_WRITE); + + memcpy(p, ctx->cs_shader->bc.bytecode, ctx->cs_shader->bc.ndw * 4); + + ctx->ws->buffer_unmap(ctx->cs_shader->shader_code_bo->cs_buf); + + evergreen_compute_init_config(ctx); + + struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader, + COMPUTE_RESOURCE_SHADER, 0); + + evergreen_reg_set(res, R_008C0C_SQ_GPR_RESOURCE_MGMT_3, + S_008C0C_NUM_LS_GPRS(ctx->cs_shader->bc.ngpr)); + + ///maybe we can use it later + evergreen_reg_set(res, R_0286C8_SPI_THREAD_GROUPING, 0); + ///maybe we can use it later + evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0); + + evergreen_reg_set(res, R_0288D4_SQ_PGM_RESOURCES_LS, + S_0288D4_NUM_GPRS(ctx->cs_shader->bc.ngpr) + | S_0288D4_STACK_SIZE(ctx->cs_shader->bc.nstack)); + evergreen_reg_set(res, R_0288D8_SQ_PGM_RESOURCES_LS_2, 0); + + evergreen_reg_set(res, R_0288D0_SQ_PGM_START_LS, 0); + res->bo = ctx->cs_shader->shader_code_bo; + res->usage = RADEON_USAGE_READ; + res->coher_bo_size = ctx->cs_shader->bc.ndw*4; + res->flags = COMPUTE_RES_SH_FLUSH; + + /* We can't always determine the + * number of iterations in a loop before it's executed, + * so we just need to set up the loop counter to give us the maximum + * number of iterations possible. Currently, loops in shader code + * ignore the loop counter and use a break instruction to exit the + * loop at the correct time. 
+ */ + evergreen_set_loop_const(ctx->cs_shader, + 0, /* index */ + 0xFFF, /* Maximum value of the loop counter (i.e. when the loop + * counter reaches this value, the program will break + * out of the loop. */ + 0x0, /* Starting value of the loop counter. */ + 0x1); /* Amount to increment the loop counter each iteration. */ +} + +/* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit + * kernel parameters there are inplicit parameters that need to be stored + * in the vertex buffer as well. Here is how these parameters are organized in + * the buffer: + * + * DWORDS 0-2: Number of work groups in each dimension (x,y,z) + * DWORDS 3-5: Number of global work items in each dimension (x,y,z) + * DWORDS 6-8: Number of work items within each work group in each dimension + * (x,y,z) + * DWORDS 9+ : Kernel parameters + */ +void evergreen_compute_upload_input( + struct pipe_context *ctx_, + const uint *block_layout, + const uint *grid_layout, + const void *input) +{ + struct r600_context *ctx = (struct r600_context *)ctx_; + int i; + unsigned kernel_parameters_offset_bytes = 36; + uint32_t * num_work_groups_start; + uint32_t * global_size_start; + uint32_t * local_size_start; + uint32_t * kernel_parameters_start; + + if (ctx->cs_shader->input_size == 0) { + return; + } + + if (!ctx->cs_shader->kernel_param) { + unsigned buffer_size = ctx->cs_shader->input_size; + + /* Add space for the grid dimensions */ + buffer_size += kernel_parameters_offset_bytes * sizeof(uint); + ctx->cs_shader->kernel_param = + r600_compute_buffer_alloc_vram(ctx->screen, + buffer_size); + } + + num_work_groups_start = ctx->ws->buffer_map( + ctx->cs_shader->kernel_param->cs_buf, + ctx->cs, PIPE_TRANSFER_WRITE); + global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4)); + local_size_start = global_size_start + (3 * (sizeof(uint)) / 4); + kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4); + + /* Copy the work group size */ + 
memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint)); + + /* Copy the global size */ + for (i = 0; i < 3; i++) { + global_size_start[i] = grid_layout[i] * block_layout[i]; + } + + /* Copy the local dimensions */ + memcpy(local_size_start, block_layout, 3 * sizeof(uint)); + + /* Copy the kernel inputs */ + memcpy(kernel_parameters_start, input, ctx->cs_shader->input_size); + + for (i = 0; i < (kernel_parameters_offset_bytes / 4) + + (ctx->cs_shader->input_size / 4); i++) { + COMPUTE_DBG("input %i : %i\n", i, + ((unsigned*)num_work_groups_start)[i]); + } + + ctx->ws->buffer_unmap(ctx->cs_shader->kernel_param->cs_buf); + + ///ID=0 is reserved for the parameters + evergreen_set_vtx_resource(ctx->cs_shader, + ctx->cs_shader->kernel_param, 0, 0, 0); + ///ID=0 is reserved for parameters + evergreen_set_const_cache(ctx->cs_shader, 0, + ctx->cs_shader->kernel_param, ctx->cs_shader->input_size, 0); +} + +void evergreen_direct_dispatch( + struct pipe_context *ctx_, + const uint *block_layout, const uint *grid_layout) +{ + struct r600_context *ctx = (struct r600_context *)ctx_; + + int i; + + struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader, + COMPUTE_RESOURCE_DISPATCH, 0); + + evergreen_reg_set(res, R_008958_VGT_PRIMITIVE_TYPE, V_008958_DI_PT_POINTLIST); + + evergreen_reg_set(res, R_00899C_VGT_COMPUTE_START_X, 0); + evergreen_reg_set(res, R_0089A0_VGT_COMPUTE_START_Y, 0); + evergreen_reg_set(res, R_0089A4_VGT_COMPUTE_START_Z, 0); + + evergreen_reg_set(res, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, block_layout[0]); + evergreen_reg_set(res, R_0286F0_SPI_COMPUTE_NUM_THREAD_Y, block_layout[1]); + evergreen_reg_set(res, R_0286F4_SPI_COMPUTE_NUM_THREAD_Z, block_layout[2]); + + int group_size = 1; + + int grid_size = 1; + + for (i = 0; i < 3; i++) { + group_size *= block_layout[i]; + } + + for (i = 0; i < 3; i++) { + grid_size *= grid_layout[i]; + } + + evergreen_reg_set(res, R_008970_VGT_NUM_INDICES, group_size); + evergreen_reg_set(res, 
R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size); + + evergreen_emit_raw_value(res, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0)); + evergreen_emit_raw_value(res, grid_layout[0]); + evergreen_emit_raw_value(res, grid_layout[1]); + evergreen_emit_raw_value(res, grid_layout[2]); + ///VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN + evergreen_emit_raw_value(res, 1); +} + +static void compute_emit_cs(struct r600_context *ctx) +{ + struct radeon_winsys_cs *cs = ctx->cs; + int i; + + r600_emit_atom(ctx, &ctx->start_cs_cmd.atom); + + struct r600_resource *onebo = NULL; + + for (i = 0; i < get_compute_resource_num(); i++) { + if (ctx->cs_shader->resources[i].enabled) { + int j; + COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw); + + for (j = 0; j < ctx->cs_shader->resources[i].cs_end; j++) { + if (ctx->cs_shader->resources[i].do_reloc[j]) { + assert(ctx->cs_shader->resources[i].bo); + evergreen_emit_ctx_reloc(ctx, + ctx->cs_shader->resources[i].bo, + ctx->cs_shader->resources[i].usage); + } + + cs->buf[cs->cdw++] = ctx->cs_shader->resources[i].cs[j]; + } + + if (ctx->cs_shader->resources[i].bo) { + onebo = ctx->cs_shader->resources[i].bo; + evergreen_emit_ctx_reloc(ctx, + ctx->cs_shader->resources[i].bo, + ctx->cs_shader->resources[i].usage); + + ///special case for textures + if (ctx->cs_shader->resources[i].do_reloc + [ctx->cs_shader->resources[i].cs_end] == 2) { + evergreen_emit_ctx_reloc(ctx, + ctx->cs_shader->resources[i].bo, + ctx->cs_shader->resources[i].usage); + } + + evergreen_set_buffer_sync(ctx, ctx->cs_shader->resources[i].bo, + ctx->cs_shader->resources[i].coher_bo_size, + ctx->cs_shader->resources[i].flags, + ctx->cs_shader->resources[i].usage); + } + } + } + +#if 0 + COMPUTE_DBG("cdw: %i\n", cs->cdw); + for (i = 0; i < cs->cdw; i++) { + COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]); + } +#endif + + ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC); + + ctx->pm4_dirty_cdwords = 0; + ctx->flags = 0; + + COMPUTE_DBG("shader started\n"); + + 
/* onebo stays NULL when no enabled resource carried a buffer; only wait when there is one */ + if (onebo) { + ctx->ws->buffer_wait(onebo->buf, 0); + } + + COMPUTE_DBG("...\n"); + + r600_emit_atom(ctx, &ctx->start_cs_cmd.atom); + + ctx->streamout_start = TRUE; + ctx->streamout_append_bitmask = ~0; + +} + +/* + * Launch one compute grid: work out the wavefronts per thread block, program + * the LDS registers, upload the kernel inputs, queue the dispatch packet and + * emit the command stream. + */ +static void evergreen_launch_grid( + struct pipe_context *ctx_, + const uint *block_layout, const uint *grid_layout, + uint32_t pc, const void *input) +{ + COMPUTE_DBG("PC: %i\n", pc); + + struct r600_context *ctx = (struct r600_context *)ctx_; + unsigned num_waves; + unsigned num_pipes = ctx->screen->info.r600_max_pipes; + unsigned wave_divisor = (16 * num_pipes); + + /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */ + num_waves = (block_layout[0] * block_layout[1] * block_layout[2] + + wave_divisor - 1) / wave_divisor; + + COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n", + num_pipes, num_waves); + + evergreen_set_lds(ctx->cs_shader, 0, 0, num_waves); + evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input); + evergreen_direct_dispatch(ctx_, block_layout, grid_layout); + compute_emit_cs(ctx); +} + +static void evergreen_set_compute_resources(struct pipe_context * ctx_, + unsigned start, unsigned count, + struct pipe_surface ** surfaces) +{ + struct r600_context *ctx = (struct r600_context *)ctx_; + struct r600_surface **resources = (struct r600_surface **)surfaces; + for (int i = 0; i < count; i++) { + if (resources[i]) { + struct r600_resource_global *buffer = + (struct r600_resource_global*)resources[i]->base.texture; + if (resources[i]->base.writable) { + assert(i+1 < 12); + + evergreen_set_rat(ctx->cs_shader, i+1, + (struct r600_resource *)resources[i]->base.texture, + buffer->chunk->start_in_dw*4, + resources[i]->base.texture->width0); + } + + evergreen_set_vtx_resource(ctx->cs_shader, + (struct r600_resource *)resources[i]->base.texture, i+2, + buffer->chunk->start_in_dw*4, 
resources[i]->base.writable); + } + } + +} + +static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_, + unsigned start_slot, unsigned count, + struct pipe_sampler_view **views) +{ + struct r600_context *ctx = (struct r600_context *)ctx_; + struct r600_pipe_sampler_view **resource = + (struct r600_pipe_sampler_view **)views; + + for (int i = 0; i < count; i++) { + if (resource[i]) { + assert(i+1 < 12); + ///FETCH0 = VTX0 (param buffer), + //FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX + evergreen_set_tex_resource(ctx->cs_shader, resource[i], i+2); + } + } +} + +static void evergreen_bind_compute_sampler_states( + struct pipe_context *ctx_, + unsigned start_slot, + unsigned num_samplers, + void **samplers_) +{ + struct r600_context *ctx = (struct r600_context *)ctx_; + struct compute_sampler_state ** samplers = + (struct compute_sampler_state **)samplers_; + + for (int i = 0; i < num_samplers; i++) { + if (samplers[i]) { + evergreen_set_sampler_resource(ctx->cs_shader, samplers[i], i); + } + } +} + +static void evergreen_set_global_binding( + struct pipe_context *ctx_, unsigned first, unsigned n, + struct pipe_resource **resources, + uint32_t **handles) +{ + struct r600_context *ctx = (struct r600_context *)ctx_; + struct compute_memory_pool *pool = ctx->screen->global_pool; + struct r600_resource_global **buffers = + (struct r600_resource_global **)resources; + + if (!resources) { + /* XXX: Unset */ + return; + } + + compute_memory_finalize_pending(pool, ctx_); + + for (int i = 0; i < n; i++) + { + assert(resources[i]->target == PIPE_BUFFER); + assert(resources[i]->bind & PIPE_BIND_GLOBAL); + + *(handles[i]) = buffers[i]->chunk->start_in_dw * 4; + } + + evergreen_set_rat(ctx->cs_shader, 0, pool->bo, 0, pool->size_in_dw * 4); + evergreen_set_vtx_resource(ctx->cs_shader, pool->bo, 1, 0, 1); +} + + +void evergreen_compute_init_config(struct r600_context *ctx) +{ + struct evergreen_compute_resource* res = + get_empty_res(ctx->cs_shader, 
COMPUTE_RESOURCE_CONFIG, 0); + + int num_threads; + int num_stack_entries; + int num_temp_gprs; + + enum radeon_family family; + unsigned tmp; + + family = ctx->family; + + switch (family) { + case CHIP_CEDAR: + default: + num_temp_gprs = 4; + num_threads = 128; + num_stack_entries = 256; + break; + case CHIP_REDWOOD: + num_temp_gprs = 4; + num_threads = 128; + num_stack_entries = 256; + break; + case CHIP_JUNIPER: + num_temp_gprs = 4; + num_threads = 128; + num_stack_entries = 512; + break; + case CHIP_CYPRESS: + case CHIP_HEMLOCK: + num_temp_gprs = 4; + num_threads = 128; + num_stack_entries = 512; + break; + case CHIP_PALM: + num_temp_gprs = 4; + num_threads = 128; + num_stack_entries = 256; + break; + case CHIP_SUMO: + num_temp_gprs = 4; + num_threads = 128; + num_stack_entries = 256; + break; + case CHIP_SUMO2: + num_temp_gprs = 4; + num_threads = 128; + num_stack_entries = 512; + break; + case CHIP_BARTS: + num_temp_gprs = 4; + num_threads = 128; + num_stack_entries = 512; + break; + case CHIP_TURKS: + num_temp_gprs = 4; + num_threads = 128; + num_stack_entries = 256; + break; + case CHIP_CAICOS: + num_temp_gprs = 4; + num_threads = 128; + num_stack_entries = 256; + break; + } + + tmp = 0x00000000; + switch (family) { + case CHIP_CEDAR: + case CHIP_PALM: + case CHIP_SUMO: + case CHIP_SUMO2: + case CHIP_CAICOS: + break; + default: + tmp |= S_008C00_VC_ENABLE(1); + break; + } + tmp |= S_008C00_EXPORT_SRC_C(1); + tmp |= S_008C00_CS_PRIO(0); + tmp |= S_008C00_LS_PRIO(0); + tmp |= S_008C00_HS_PRIO(0); + tmp |= S_008C00_PS_PRIO(0); + tmp |= S_008C00_VS_PRIO(0); + tmp |= S_008C00_GS_PRIO(0); + tmp |= S_008C00_ES_PRIO(0); + + evergreen_reg_set(res, R_008C00_SQ_CONFIG, tmp); + + evergreen_reg_set(res, R_008C04_SQ_GPR_RESOURCE_MGMT_1, + S_008C04_NUM_CLAUSE_TEMP_GPRS(num_temp_gprs)); + evergreen_reg_set(res, R_008C08_SQ_GPR_RESOURCE_MGMT_2, 0); + evergreen_reg_set(res, R_008C10_SQ_GLOBAL_GPR_RESOURCE_MGMT_1, 0); + evergreen_reg_set(res, 
R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0); + evergreen_reg_set(res, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8)); + /* workaround for hw issues with dyn gpr - must set all limits to 240 + * instead of 0, 0x1e == 240/8 */ + evergreen_reg_set(res, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1, + S_028838_PS_GPRS(0x1e) | + S_028838_VS_GPRS(0x1e) | + S_028838_GS_GPRS(0x1e) | + S_028838_ES_GPRS(0x1e) | + S_028838_HS_GPRS(0x1e) | + S_028838_LS_GPRS(0x1e)); + + + evergreen_reg_set(res, R_008E20_SQ_STATIC_THREAD_MGMT1, 0xFFFFFFFF); + evergreen_reg_set(res, R_008E24_SQ_STATIC_THREAD_MGMT2, 0xFFFFFFFF); + evergreen_reg_set(res, R_008E28_SQ_STATIC_THREAD_MGMT3, 0xFFFFFFFF); + evergreen_reg_set(res, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 0); + tmp = S_008C1C_NUM_LS_THREADS(num_threads); + evergreen_reg_set(res, R_008C1C_SQ_THREAD_RESOURCE_MGMT_2, tmp); + evergreen_reg_set(res, R_008C20_SQ_STACK_RESOURCE_MGMT_1, 0); + evergreen_reg_set(res, R_008C24_SQ_STACK_RESOURCE_MGMT_2, 0); + tmp = S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries); + evergreen_reg_set(res, R_008C28_SQ_STACK_RESOURCE_MGMT_3, tmp); + evergreen_reg_set(res, R_0286CC_SPI_PS_IN_CONTROL_0, S_0286CC_LINEAR_GRADIENT_ENA(1)); + evergreen_reg_set(res, R_0286D0_SPI_PS_IN_CONTROL_1, 0); + evergreen_reg_set(res, R_0286E4_SPI_PS_IN_CONTROL_2, 0); + evergreen_reg_set(res, R_0286D8_SPI_INPUT_Z, 0); + evergreen_reg_set(res, R_0286E0_SPI_BARYC_CNTL, 1 << 20); + tmp = S_0286E8_TID_IN_GROUP_ENA | S_0286E8_TGID_ENA | S_0286E8_DISABLE_INDEX_PACK; + evergreen_reg_set(res, R_0286E8_SPI_COMPUTE_INPUT_CNTL, tmp); + tmp = S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1); + evergreen_reg_set(res, R_028A40_VGT_GS_MODE, tmp); + evergreen_reg_set(res, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/); + evergreen_reg_set(res, R_028800_DB_DEPTH_CONTROL, 0); + evergreen_reg_set(res, R_02880C_DB_SHADER_CONTROL, 0); + evergreen_reg_set(res, R_028000_DB_RENDER_CONTROL, S_028000_COLOR_DISABLE(1)); + evergreen_reg_set(res, 
R_02800C_DB_RENDER_OVERRIDE, 0); + evergreen_reg_set(res, R_0286E8_SPI_COMPUTE_INPUT_CNTL, + S_0286E8_TID_IN_GROUP_ENA + | S_0286E8_TGID_ENA + | S_0286E8_DISABLE_INDEX_PACK) + ; +} + +void evergreen_init_compute_state_functions(struct r600_context *ctx) +{ + ctx->context.create_compute_state = evergreen_create_compute_state; + ctx->context.delete_compute_state = evergreen_delete_compute_state; + ctx->context.bind_compute_state = evergreen_bind_compute_state; +// ctx->context.create_sampler_view = evergreen_compute_create_sampler_view; + ctx->context.set_compute_resources = evergreen_set_compute_resources; + ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view; + ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states; + ctx->context.set_global_binding = evergreen_set_global_binding; + ctx->context.launch_grid = evergreen_launch_grid; +} + + +/** + * Create a PIPE_BIND_GLOBAL buffer backed by a chunk of the screen's global + * compute memory pool; the size is rounded up to whole dwords. + * + * Returns NULL when either the wrapper object or the pool chunk cannot be + * allocated. + */ +struct pipe_resource *r600_compute_global_buffer_create( + struct pipe_screen *screen, + const struct pipe_resource *templ) +{ + assert(templ->target == PIPE_BUFFER); + assert(templ->bind & PIPE_BIND_GLOBAL); + assert(templ->array_size == 1 || templ->array_size == 0); + assert(templ->depth0 == 1 || templ->depth0 == 0); + assert(templ->height0 == 1 || templ->height0 == 0); + + struct r600_resource_global* result = (struct r600_resource_global*) + CALLOC(sizeof(struct r600_resource_global), 1); + struct r600_screen* rscreen = (struct r600_screen*)screen; + + if (result == NULL) { + return NULL; + } + + result->base.b.vtbl = &r600_global_buffer_vtbl; + /* Copy the template first: the struct copy overwrites every field, + * so screen and the reference count must be set afterwards. */ + result->base.b.b = *templ; + result->base.b.b.screen = screen; + pipe_reference_init(&result->base.b.b.reference, 1); + + int size_in_dw = (templ->width0+3) / 4; + + result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw); + + if (result->chunk == NULL) + { + free(result); + return NULL; + } + + return &result->base.b.b; +} + +void r600_compute_global_buffer_destroy( + struct pipe_screen *screen, + struct pipe_resource *res) +{ + assert(res->target == PIPE_BUFFER); + 
assert(res->bind & PIPE_BIND_GLOBAL); + + struct r600_resource_global* buffer = (struct r600_resource_global*)res; + struct r600_screen* rscreen = (struct r600_screen*)screen; + + compute_memory_free(rscreen->global_pool, buffer->chunk->id); + + buffer->chunk = NULL; + free(res); +} + +void* r600_compute_global_transfer_map( + struct pipe_context *ctx_, + struct pipe_transfer* transfer) +{ + assert(transfer->resource->target == PIPE_BUFFER); + assert(transfer->resource->bind & PIPE_BIND_GLOBAL); + assert(transfer->box.x >= 0); + assert(transfer->box.y == 0); + assert(transfer->box.z == 0); + + struct r600_context *ctx = (struct r600_context *)ctx_; + struct r600_resource_global* buffer = + (struct r600_resource_global*)transfer->resource; + + uint32_t* map; + ///TODO: do it better, mapping is not possible if the pool is too big + + if (!(map = ctx->ws->buffer_map(buffer->chunk->pool->bo->cs_buf, + ctx->cs, transfer->usage))) { + return NULL; + } + + COMPUTE_DBG("buffer start: %lli\n", buffer->chunk->start_in_dw); + return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x; +} + +void r600_compute_global_transfer_unmap( + struct pipe_context *ctx_, + struct pipe_transfer* transfer) +{ + assert(transfer->resource->target == PIPE_BUFFER); + assert(transfer->resource->bind & PIPE_BIND_GLOBAL); + + struct r600_context *ctx = (struct r600_context *)ctx_; + struct r600_resource_global* buffer = + (struct r600_resource_global*)transfer->resource; + + ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf); +} + +struct pipe_transfer * r600_compute_global_get_transfer( + struct pipe_context *ctx_, + struct pipe_resource *resource, + unsigned level, + unsigned usage, + const struct pipe_box *box) +{ + struct r600_context *ctx = (struct r600_context *)ctx_; + struct compute_memory_pool *pool = ctx->screen->global_pool; + + compute_memory_finalize_pending(pool, ctx_); + + assert(resource->target == PIPE_BUFFER); + struct r600_context *rctx = (struct 
r600_context*)ctx_; + struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers); + + transfer->resource = resource; + transfer->level = level; + transfer->usage = usage; + transfer->box = *box; + transfer->stride = 0; + transfer->layer_stride = 0; + transfer->data = NULL; + + /* Note strides are zero, this is ok for buffers, but not for + * textures 2d & higher at least. + */ + return transfer; +} + +void r600_compute_global_transfer_destroy( + struct pipe_context *ctx_, + struct pipe_transfer *transfer) +{ + struct r600_context *rctx = (struct r600_context*)ctx_; + util_slab_free(&rctx->pool_transfers, transfer); +} + +void r600_compute_global_transfer_flush_region( + struct pipe_context *ctx_, + struct pipe_transfer *transfer, + const struct pipe_box *box) +{ + assert(0 && "TODO"); +} + +void r600_compute_global_transfer_inline_write( + struct pipe_context *pipe, + struct pipe_resource *resource, + unsigned level, + unsigned usage, + const struct pipe_box *box, + const void *data, + unsigned stride, + unsigned layer_stride) +{ + assert(0 && "TODO"); +} diff --git a/src/gallium/drivers/r600/evergreen_compute.h b/src/gallium/drivers/r600/evergreen_compute.h new file mode 100644 index 00000000000..a0881cde7db --- /dev/null +++ b/src/gallium/drivers/r600/evergreen_compute.h @@ -0,0 +1,69 @@ +/* + * Copyright 2011 Adam Rak + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Adam Rak + */ + +#ifndef EVERGREEN_COMPUTE_H +#define EVERGREEN_COMPUTE_H +#include "r600.h" +#include "r600_pipe.h" + +struct evergreen_compute_resource; + +void *evergreen_create_compute_state(struct pipe_context *ctx, const struct pipe_compute_state *cso); +void evergreen_delete_compute_state(struct pipe_context *ctx, void *state); +void evergreen_direct_dispatch( struct pipe_context *context, const uint *block_layout, const uint *grid_layout); +void evergreen_compute_upload_input(struct pipe_context *context, const uint *block_layout, const uint *grid_layout, const void *input); +void evergreen_compute_init_config(struct r600_context *rctx); +void evergreen_init_compute_state_functions(struct r600_context *rctx); + +struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen, const struct pipe_resource *templ); +void r600_compute_global_buffer_destroy(struct pipe_screen *screen, struct pipe_resource *res); +void* r600_compute_global_transfer_map(struct pipe_context *ctx, struct pipe_transfer* transfer); +void r600_compute_global_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer* transfer); +struct pipe_transfer * r600_compute_global_get_transfer(struct pipe_context *, struct pipe_resource *, unsigned level, + unsigned usage, const struct pipe_box *); +void r600_compute_global_transfer_destroy(struct pipe_context *, struct pipe_transfer *); +void r600_compute_global_transfer_flush_region( struct pipe_context *, 
struct pipe_transfer *, const struct pipe_box *); +void r600_compute_global_transfer_inline_write( struct pipe_context *, struct pipe_resource *, unsigned level, + unsigned usage, const struct pipe_box *, const void *data, unsigned stride, unsigned layer_stride); + + +/** + * printf-style debug output, emitted only when the R600_COMPUTE_DEBUG + * option is enabled. The option is looked up on the first call and the + * result is cached for all later calls. + */ +static inline void COMPUTE_DBG(const char *fmt, ...) +{ + static bool check_debug = false, debug = false; + + if (!check_debug) { + debug = debug_get_bool_option("R600_COMPUTE_DEBUG", FALSE); + /* remember that the lookup was done so it is not repeated per call */ + check_debug = true; + } + + if (debug) { + va_list ap; + va_start(ap, fmt); + _debug_vprintf(fmt, ap); + va_end(ap); + } +} + +#endif diff --git a/src/gallium/drivers/r600/evergreen_compute_internal.c b/src/gallium/drivers/r600/evergreen_compute_internal.c new file mode 100644 index 00000000000..209f064d1de --- /dev/null +++ b/src/gallium/drivers/r600/evergreen_compute_internal.c @@ -0,0 +1,830 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Adam Rak + */ + +#include +#include + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "util/u_blitter.h" +#include "util/u_double_list.h" +#include "util/u_transfer.h" +#include "util/u_surface.h" +#include "util/u_pack_color.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "util/u_framebuffer.h" +#include "r600.h" +#include "r600_resource.h" +#include "r600_shader.h" +#include "r600_pipe.h" +#include "r600_formats.h" +#include "evergreend.h" +#include "evergreen_compute_internal.h" +#include "r600_hw_context_priv.h" + +int get_compute_resource_num(void) +{ + int num = 0; +#define DECL_COMPUTE_RESOURCE(name, n) num += n; +#include "compute_resource.def" +#undef DECL_COMPUTE_RESOURCE + return num; +} + +void evergreen_emit_raw_value( + struct evergreen_compute_resource* res, + unsigned value) +{ + res->cs[res->cs_end++] = value; +} + +void evergreen_emit_ctx_value(struct r600_context *ctx, unsigned value) +{ + ctx->cs->buf[ctx->cs->cdw++] = value; +} + +void evergreen_mult_reg_set_( + struct evergreen_compute_resource* res, + int index, + u32* array, + int size) +{ + int i = 0; + + evergreen_emit_raw_reg_set(res, index, size / 4); + + for (i = 0; i < size; i+=4) { + res->cs[res->cs_end++] = array[i / 4]; + } +} + +void evergreen_reg_set( + struct evergreen_compute_resource* res, + unsigned index, + unsigned value) +{ + evergreen_emit_raw_reg_set(res, index, 1); + res->cs[res->cs_end++] = value; +} + +struct evergreen_compute_resource* get_empty_res( + struct r600_pipe_compute* pipe, + enum evergreen_compute_resources res_code, + int offset_index) +{ + int code_index = -1; + int code_size = -1; + + { + int i = 0; + #define DECL_COMPUTE_RESOURCE(name, n) if (COMPUTE_RESOURCE_ ## name == res_code) {code_index = i; code_size = n;} i += n; + #include "compute_resource.def" + #undef DECL_COMPUTE_RESOURCE + } + + assert(code_index != -1 && "internal error: resouce index not found"); + 
assert(offset_index < code_size && "internal error: overindexing resource"); + + int index = code_index + offset_index; + + struct evergreen_compute_resource* res = &pipe->resources[index]; + + res->enabled = true; + res->bo = NULL; + res->cs_end = 0; + bzero(&res->do_reloc, sizeof(res->do_reloc)); + + return res; +} + +void evergreen_emit_raw_reg_set( + struct evergreen_compute_resource* res, + unsigned index, + int num) +{ + res->enabled = 1; + int cs_end = res->cs_end; + + if (index >= EVERGREEN_CONFIG_REG_OFFSET + && index < EVERGREEN_CONFIG_REG_END) { + res->cs[cs_end] = PKT3C(PKT3_SET_CONFIG_REG, num, 0); + res->cs[cs_end+1] = (index - EVERGREEN_CONFIG_REG_OFFSET) >> 2; + } else if (index >= EVERGREEN_CONTEXT_REG_OFFSET + && index < EVERGREEN_CONTEXT_REG_END) { + res->cs[cs_end] = PKT3C(PKT3_SET_CONTEXT_REG, num, 0); + res->cs[cs_end+1] = (index - EVERGREEN_CONTEXT_REG_OFFSET) >> 2; + } else if (index >= EVERGREEN_RESOURCE_OFFSET + && index < EVERGREEN_RESOURCE_END) { + res->cs[cs_end] = PKT3C(PKT3_SET_RESOURCE, num, 0); + res->cs[cs_end+1] = (index - EVERGREEN_RESOURCE_OFFSET) >> 2; + } else if (index >= EVERGREEN_SAMPLER_OFFSET + && index < EVERGREEN_SAMPLER_END) { + res->cs[cs_end] = PKT3C(PKT3_SET_SAMPLER, num, 0); + res->cs[cs_end+1] = (index - EVERGREEN_SAMPLER_OFFSET) >> 2; + } else if (index >= EVERGREEN_CTL_CONST_OFFSET + && index < EVERGREEN_CTL_CONST_END) { + res->cs[cs_end] = PKT3C(PKT3_SET_CTL_CONST, num, 0); + res->cs[cs_end+1] = (index - EVERGREEN_CTL_CONST_OFFSET) >> 2; + } else if (index >= EVERGREEN_LOOP_CONST_OFFSET + && index < EVERGREEN_LOOP_CONST_END) { + res->cs[cs_end] = PKT3C(PKT3_SET_LOOP_CONST, num, 0); + res->cs[cs_end+1] = (index - EVERGREEN_LOOP_CONST_OFFSET) >> 2; + } else if (index >= EVERGREEN_BOOL_CONST_OFFSET + && index < EVERGREEN_BOOL_CONST_END) { + res->cs[cs_end] = PKT3C(PKT3_SET_BOOL_CONST, num, 0); + res->cs[cs_end+1] = (index - EVERGREEN_BOOL_CONST_OFFSET) >> 2; + } else { + res->cs[cs_end] = PKT0(index, num-1); + 
res->cs_end--; + } + + res->cs_end += 2; +} + +void evergreen_emit_force_reloc(struct evergreen_compute_resource* res) +{ + res->do_reloc[res->cs_end] += 1; +} + +void evergreen_emit_ctx_reg_set( + struct r600_context *ctx, + unsigned index, + int num) +{ + + if (index >= EVERGREEN_CONFIG_REG_OFFSET + && index < EVERGREEN_CONFIG_REG_END) { + ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_CONFIG_REG, num, 0); + ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_CONFIG_REG_OFFSET) >> 2; + } else if (index >= EVERGREEN_CONTEXT_REG_OFFSET + && index < EVERGREEN_CONTEXT_REG_END) { + ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_CONTEXT_REG, num, 0); + ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_CONTEXT_REG_OFFSET) >> 2; + } else if (index >= EVERGREEN_RESOURCE_OFFSET + && index < EVERGREEN_RESOURCE_END) { + ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_RESOURCE, num, 0); + ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_RESOURCE_OFFSET) >> 2; + } else if (index >= EVERGREEN_SAMPLER_OFFSET + && index < EVERGREEN_SAMPLER_END) { + ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_SAMPLER, num, 0); + ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_SAMPLER_OFFSET) >> 2; + } else if (index >= EVERGREEN_CTL_CONST_OFFSET + && index < EVERGREEN_CTL_CONST_END) { + ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_CTL_CONST, num, 0); + ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_CTL_CONST_OFFSET) >> 2; + } else if (index >= EVERGREEN_LOOP_CONST_OFFSET + && index < EVERGREEN_LOOP_CONST_END) { + ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_LOOP_CONST, num, 0); + ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_LOOP_CONST_OFFSET) >> 2; + } else if (index >= EVERGREEN_BOOL_CONST_OFFSET + && index < EVERGREEN_BOOL_CONST_END) { + ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_BOOL_CONST, num, 0); + ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_BOOL_CONST_OFFSET) >> 2; + } else { + ctx->cs->buf[ctx->cs->cdw++] = PKT0(index, num-1); + } +} + +void evergreen_emit_ctx_reloc( + 
struct r600_context *ctx, + struct r600_resource *bo, + enum radeon_bo_usage usage) +{ + assert(bo); + + ctx->cs->buf[ctx->cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + u32 rr = r600_context_bo_reloc(ctx, bo, usage); + ctx->cs->buf[ctx->cs->cdw++] = rr; +} + +void evergreen_set_buffer_sync( + struct r600_context *ctx, + struct r600_resource* bo, + int size, + int flags, + enum radeon_bo_usage usage) +{ + assert(bo); + int32_t cp_coher_size = 0; + + if (size == 0xffffffff || size == 0) { + cp_coher_size = 0xffffffff; + } + else { + cp_coher_size = ((size + 255) >> 8); + } + + uint32_t sync_flags = 0; + + if ((flags & COMPUTE_RES_TC_FLUSH) == COMPUTE_RES_TC_FLUSH) { + sync_flags |= S_0085F0_TC_ACTION_ENA(1); + } + + if ((flags & COMPUTE_RES_VC_FLUSH) == COMPUTE_RES_VC_FLUSH) { + sync_flags |= S_0085F0_VC_ACTION_ENA(1); + } + + if ((flags & COMPUTE_RES_SH_FLUSH) == COMPUTE_RES_SH_FLUSH) { + sync_flags |= S_0085F0_SH_ACTION_ENA(1); + } + + if ((flags & COMPUTE_RES_CB_FLUSH(0)) == COMPUTE_RES_CB_FLUSH(0)) { + sync_flags |= S_0085F0_CB_ACTION_ENA(1); + + switch((flags >> 8) & 0xF) { + case 0: + sync_flags |= S_0085F0_CB0_DEST_BASE_ENA(1); + break; + case 1: + sync_flags |= S_0085F0_CB1_DEST_BASE_ENA(1); + break; + case 2: + sync_flags |= S_0085F0_CB2_DEST_BASE_ENA(1); + break; + case 3: + sync_flags |= S_0085F0_CB3_DEST_BASE_ENA(1); + break; + case 4: + sync_flags |= S_0085F0_CB4_DEST_BASE_ENA(1); + break; + case 5: + sync_flags |= S_0085F0_CB5_DEST_BASE_ENA(1); + break; + case 6: + sync_flags |= S_0085F0_CB6_DEST_BASE_ENA(1); + break; + case 7: + sync_flags |= S_0085F0_CB7_DEST_BASE_ENA(1); + break; + case 8: + sync_flags |= S_0085F0_CB8_DEST_BASE_ENA(1); + break; + case 9: + sync_flags |= S_0085F0_CB9_DEST_BASE_ENA(1); + break; + case 10: + sync_flags |= S_0085F0_CB10_DEST_BASE_ENA(1); + break; + case 11: + sync_flags |= S_0085F0_CB11_DEST_BASE_ENA(1); + break; + default: + assert(0); + } + } + + int32_t poll_interval = 10; + + ctx->cs->buf[ctx->cs->cdw++] = 
PKT3(PKT3_SURFACE_SYNC, 3, 0); + ctx->cs->buf[ctx->cs->cdw++] = sync_flags; + ctx->cs->buf[ctx->cs->cdw++] = cp_coher_size; + ctx->cs->buf[ctx->cs->cdw++] = 0; + ctx->cs->buf[ctx->cs->cdw++] = poll_interval; + + if (cp_coher_size != 0xffffffff) { + evergreen_emit_ctx_reloc(ctx, bo, usage); + } +} + +int evergreen_compute_get_gpu_format( + struct number_type_and_format* fmt, + struct r600_resource *bo) +{ + switch (bo->b.b.format) + { + case PIPE_FORMAT_R8_UNORM: + case PIPE_FORMAT_R32_UNORM: + fmt->format = V_028C70_COLOR_32; + fmt->number_type = V_028C70_NUMBER_UNORM; + fmt->num_format_all = 0; + break; + case PIPE_FORMAT_R32_FLOAT: + fmt->format = V_028C70_COLOR_32_FLOAT; + fmt->number_type = V_028C70_NUMBER_FLOAT; + fmt->num_format_all = 0; + break; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + fmt->format = V_028C70_COLOR_32_32_32_32_FLOAT; + fmt->number_type = V_028C70_NUMBER_FLOAT; + fmt->num_format_all = 0; + break; + + ///TODO: other formats... + + default: + return 0; + } + + return 1; +} + +void evergreen_set_rat( + struct r600_pipe_compute *pipe, + int id, + struct r600_resource* bo, + int start, + int size) +{ + assert(id < 12); + assert((size & 3) == 0); + assert((start & 0xFF) == 0); + + int offset; + COMPUTE_DBG("bind rat: %i \n", id); + + if (id < 8) { + offset = id*0x3c; + } + else { + offset = 8*0x3c + (id-8)*0x1c; + } + + int linear = 0; + + if (bo->b.b.height0 <= 1 && bo->b.b.depth0 <= 1 + && bo->b.b.target == PIPE_BUFFER) { + linear = 1; + } + + struct evergreen_compute_resource* res = + get_empty_res(pipe, COMPUTE_RESOURCE_RAT, id); + + evergreen_emit_force_reloc(res); + + evergreen_reg_set(res, R_028C64_CB_COLOR0_PITCH, 0); ///TODO: for 2D? 
+ evergreen_reg_set(res, R_028C68_CB_COLOR0_SLICE, 0); + + struct number_type_and_format fmt; + + ///default config + if (bo->b.b.format == PIPE_FORMAT_NONE) { + fmt.format = V_028C70_COLOR_32; + fmt.number_type = V_028C70_NUMBER_FLOAT; + } else { + evergreen_compute_get_gpu_format(&fmt, bo); + } + + if (linear) { + evergreen_reg_set(res, + R_028C70_CB_COLOR0_INFO, S_028C70_RAT(1) + | S_028C70_ARRAY_MODE(V_028C70_ARRAY_LINEAR_ALIGNED) + | S_028C70_FORMAT(fmt.format) + | S_028C70_NUMBER_TYPE(fmt.number_type) + ); + evergreen_emit_force_reloc(res); + } else { + assert(0 && "TODO"); + ///TODO +// evergreen_reg_set(res, R_028C70_CB_COLOR0_INFO, S_028C70_RAT(1) | S_028C70_ARRAY_MODE(????)); +// evergreen_emit_force_reloc(res); + } + + evergreen_reg_set(res, R_028C74_CB_COLOR0_ATTRIB, S_028C74_NON_DISP_TILING_ORDER(1)); + evergreen_emit_force_reloc(res); + + if (linear) { + /* XXX: Why are we using size instead of bo->b.b.b.width0 ? */ + evergreen_reg_set(res, R_028C78_CB_COLOR0_DIM, size); + } else { + evergreen_reg_set(res, R_028C78_CB_COLOR0_DIM, + S_028C78_WIDTH_MAX(bo->b.b.width0) + | S_028C78_HEIGHT_MAX(bo->b.b.height0)); + } + + if (id < 8) { + evergreen_reg_set(res, R_028C7C_CB_COLOR0_CMASK, 0); + evergreen_emit_force_reloc(res); + evergreen_reg_set(res, R_028C84_CB_COLOR0_FMASK, 0); + evergreen_emit_force_reloc(res); + } + + evergreen_reg_set(res, R_028C60_CB_COLOR0_BASE + offset, start >> 8); + + res->bo = bo; + res->usage = RADEON_USAGE_READWRITE; + res->coher_bo_size = size; + res->flags = COMPUTE_RES_CB_FLUSH(id); +} + +void evergreen_set_lds( + struct r600_pipe_compute *pipe, + int num_lds, + int size, + int num_waves) +{ + struct evergreen_compute_resource* res = + get_empty_res(pipe, COMPUTE_RESOURCE_LDS, 0); + + evergreen_reg_set(res, R_008E2C_SQ_LDS_RESOURCE_MGMT, + S_008E2C_NUM_LS_LDS(num_lds)); + evergreen_reg_set(res, CM_R_0288E8_SQ_LDS_ALLOC, size | num_waves << 14); +} + +void evergreen_set_gds( + struct r600_pipe_compute *pipe, + uint32_t addr, + 
uint32_t size) +{ + struct evergreen_compute_resource* res = + get_empty_res(pipe, COMPUTE_RESOURCE_GDS, 0); + + evergreen_reg_set(res, R_028728_GDS_ORDERED_WAVE_PER_SE, 1); + evergreen_reg_set(res, R_028720_GDS_ADDR_BASE, addr); + evergreen_reg_set(res, R_028724_GDS_ADDR_SIZE, size); +} + +void evergreen_set_export( + struct r600_pipe_compute *pipe, + struct r600_resource* bo, + int offset, int size) +{ + #define SX_MEMORY_EXPORT_BASE 0x9010 + #define SX_MEMORY_EXPORT_SIZE 0x9014 + + struct evergreen_compute_resource* res = + get_empty_res(pipe, COMPUTE_RESOURCE_EXPORT, 0); + + evergreen_reg_set(res, SX_MEMORY_EXPORT_SIZE, size); + + if (size) { + evergreen_reg_set(res, SX_MEMORY_EXPORT_BASE, offset); + res->bo = bo; + res->usage = RADEON_USAGE_WRITE; + res->coher_bo_size = size; + res->flags = 0; + } +} + +void evergreen_set_loop_const( + struct r600_pipe_compute *pipe, + int id, int count, int init, int inc) { + + struct evergreen_compute_resource* res = + get_empty_res(pipe, COMPUTE_RESOURCE_LOOP, id); + + assert(id < 32); + assert(count <= 0xFFF); + assert(init <= 0xFF); + assert(inc <= 0xFF); + + /* Compute shaders use LOOP_CONST registers SQ_LOOP_CONST_160 to + * SQ_LOOP_CONST_191 */ + evergreen_reg_set(res, R_03A200_SQ_LOOP_CONST_0 + (160 * 4) + (id * 4), + count | init << 12 | inc << 24); +} + +void evergreen_set_tmp_ring( + struct r600_pipe_compute *pipe, + struct r600_resource* bo, + int offset, int size, int se) +{ + #define SQ_LSTMP_RING_BASE 0x00008e10 + #define SQ_LSTMP_RING_SIZE 0x00008e14 + #define GRBM_GFX_INDEX 0x802C + #define INSTANCE_INDEX(x) ((x) << 0) + #define SE_INDEX(x) ((x) << 16) + #define INSTANCE_BROADCAST_WRITES (1 << 30) + #define SE_BROADCAST_WRITES (1 << 31) + + struct evergreen_compute_resource* res = + get_empty_res(pipe, COMPUTE_RESOURCE_TMPRING, se); + + evergreen_reg_set(res, + GRBM_GFX_INDEX,INSTANCE_INDEX(0) + | SE_INDEX(se) + | INSTANCE_BROADCAST_WRITES); + evergreen_reg_set(res, SQ_LSTMP_RING_SIZE, size); + + if (size) { 
+ assert(bo); + + evergreen_reg_set(res, SQ_LSTMP_RING_BASE, offset); + res->bo = bo; + res->usage = RADEON_USAGE_WRITE; + res->coher_bo_size = 0; + res->flags = 0; + } + + if (size) { + evergreen_emit_force_reloc(res); + } + + evergreen_reg_set(res, + GRBM_GFX_INDEX,INSTANCE_INDEX(0) + | SE_INDEX(0) + | INSTANCE_BROADCAST_WRITES + | SE_BROADCAST_WRITES); +} + +static uint32_t r600_colorformat_endian_swap(uint32_t colorformat) +{ + if (R600_BIG_ENDIAN) { + switch(colorformat) { + case V_028C70_COLOR_4_4: + return ENDIAN_NONE; + + /* 8-bit buffers. */ + case V_028C70_COLOR_8: + return ENDIAN_NONE; + + /* 16-bit buffers. */ + case V_028C70_COLOR_5_6_5: + case V_028C70_COLOR_1_5_5_5: + case V_028C70_COLOR_4_4_4_4: + case V_028C70_COLOR_16: + case V_028C70_COLOR_8_8: + return ENDIAN_8IN16; + + /* 32-bit buffers. */ + case V_028C70_COLOR_8_8_8_8: + case V_028C70_COLOR_2_10_10_10: + case V_028C70_COLOR_8_24: + case V_028C70_COLOR_24_8: + case V_028C70_COLOR_32_FLOAT: + case V_028C70_COLOR_16_16_FLOAT: + case V_028C70_COLOR_16_16: + return ENDIAN_8IN32; + + /* 64-bit buffers. */ + case V_028C70_COLOR_16_16_16_16: + case V_028C70_COLOR_16_16_16_16_FLOAT: + return ENDIAN_8IN16; + + case V_028C70_COLOR_32_32_FLOAT: + case V_028C70_COLOR_32_32: + case V_028C70_COLOR_X24_8_32_FLOAT: + return ENDIAN_8IN32; + + /* 96-bit buffers. */ + case V_028C70_COLOR_32_32_32_FLOAT: + /* 128-bit buffers. */ + case V_028C70_COLOR_32_32_32_32_FLOAT: + case V_028C70_COLOR_32_32_32_32: + return ENDIAN_8IN32; + default: + return ENDIAN_NONE; /* Unsupported. 
*/
+		}
+	} else {
+		return ENDIAN_NONE;
+	}
+}
+
+/**
+ * Translate a PIPE_TEXTURE_* target into the SQ_TEX_DIM_* hardware value.
+ * Unknown targets fall back to the 1D value.
+ */
+static unsigned r600_tex_dim(unsigned dim)
+{
+	switch (dim) {
+	default:
+	case PIPE_TEXTURE_1D:
+		return V_030000_SQ_TEX_DIM_1D;
+	case PIPE_TEXTURE_1D_ARRAY:
+		return V_030000_SQ_TEX_DIM_1D_ARRAY;
+	case PIPE_TEXTURE_2D:
+	case PIPE_TEXTURE_RECT:
+		return V_030000_SQ_TEX_DIM_2D;
+	case PIPE_TEXTURE_2D_ARRAY:
+		return V_030000_SQ_TEX_DIM_2D_ARRAY;
+	case PIPE_TEXTURE_3D:
+		return V_030000_SQ_TEX_DIM_3D;
+	case PIPE_TEXTURE_CUBE:
+		return V_030000_SQ_TEX_DIM_CUBEMAP;
+	}
+}
+
+/**
+ * Emit a SET_RESOURCE packet describing \p bo as a vertex (buffer) resource.
+ *
+ * \param id       resource slot, must be below 16
+ * \param offset   GPU address offset; low 32 bits go in the first resource
+ *                 word, the high bits in BASE_ADDRESS_HI
+ * \param writable non-zero requests read/write usage, zero read-only
+ *
+ * The buffer must be one-dimensional (height0 and depth0 at most 1).
+ */
+void evergreen_set_vtx_resource(
+	struct r600_pipe_compute *pipe,
+	struct r600_resource* bo,
+	int id, uint64_t offset, int writable)
+{
+	assert(id < 16);
+	uint32_t sq_vtx_constant_word2, sq_vtx_constant_word3, sq_vtx_constant_word4;
+	/* Zero everything so no field is indeterminate if the format
+	 * lookup fails in a build where assert() is compiled out. */
+	struct number_type_and_format fmt = {0};
+
+	assert(bo->b.b.height0 <= 1);
+	assert(bo->b.b.depth0 <= 1);
+
+	int e = evergreen_compute_get_gpu_format(&fmt, bo);
+
+	assert(e && "unknown format");
+	(void)e; /* only consumed by the assert above */
+
+	struct evergreen_compute_resource* res =
+		get_empty_res(pipe, COMPUTE_RESOURCE_VERT, id);
+
+	unsigned size = bo->b.b.width0;
+	unsigned stride = 1;
+
+//	size = (size * util_format_get_blockwidth(bo->b.b.b.format) *
+//		util_format_get_blocksize(bo->b.b.b.format));
+
+	COMPUTE_DBG("id: %i vtx size: %u byte, width0: %u elem\n",
+		id, size, bo->b.b.width0);
+
+	sq_vtx_constant_word2 =
+		S_030008_BASE_ADDRESS_HI(offset >> 32) |
+		S_030008_STRIDE(stride) |
+		S_030008_DATA_FORMAT(fmt.format) |
+		S_030008_NUM_FORMAT_ALL(fmt.num_format_all) |
+		S_030008_ENDIAN_SWAP(0);
+
+	/* offset is 64-bit: it must not be printed with %i, which would
+	 * also misalign every argument that follows it. */
+	COMPUTE_DBG("%08X %llu %u %u %u\n", sq_vtx_constant_word2,
+		(unsigned long long)offset,
+		stride, fmt.format, fmt.num_format_all);
+
+	sq_vtx_constant_word3 =
+		S_03000C_DST_SEL_X(0) |
+		S_03000C_DST_SEL_Y(1) |
+		S_03000C_DST_SEL_Z(2) |
+		S_03000C_DST_SEL_W(3);
+
+	sq_vtx_constant_word4 = 0;
+
+	evergreen_emit_raw_value(res, PKT3C(PKT3_SET_RESOURCE, 8, 0));
+	evergreen_emit_raw_value(res, (id+816)*32 >> 2);
+	evergreen_emit_raw_value(res, (unsigned)((offset) & 0xffffffff));
+	evergreen_emit_raw_value(res, size - 1);
+	evergreen_emit_raw_value(res, sq_vtx_constant_word2);
+	evergreen_emit_raw_value(res, sq_vtx_constant_word3);
+	evergreen_emit_raw_value(res, sq_vtx_constant_word4);
+	evergreen_emit_raw_value(res, 0);
+	evergreen_emit_raw_value(res, 0);
+	evergreen_emit_raw_value(res, S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_BUFFER));
+
+	res->bo = bo;
+
+	if (writable) {
+		res->usage = RADEON_USAGE_READWRITE;
+	}
+	else {
+		res->usage = RADEON_USAGE_READ;
+	}
+
+	res->coher_bo_size = size;
+	res->flags = COMPUTE_RES_TC_FLUSH | COMPUTE_RES_VC_FLUSH;
+}
+
+/**
+ * Emit a SET_RESOURCE packet describing the texture behind \p view.
+ * Array targets (1D_ARRAY, 2D_ARRAY) are rejected by assertion.
+ */
+void evergreen_set_tex_resource(
+	struct r600_pipe_compute *pipe,
+	struct r600_pipe_sampler_view* view,
+	int id)
+{
+	struct evergreen_compute_resource* res =
+		get_empty_res(pipe, COMPUTE_RESOURCE_TEX, id);
+	struct r600_resource_texture *tmp =
+		(struct r600_resource_texture*)view->base.texture;
+
+	unsigned format, endian;
+	uint32_t word4 = 0, yuv_format = 0, pitch = 0;
+	unsigned char swizzle[4], array_mode = 0, tile_type = 0;
+	unsigned height, depth;
+
+	swizzle[0] = 0;
+	swizzle[1] = 1;
+	swizzle[2] = 2;
+	swizzle[3] = 3;
+
+	format = r600_translate_texformat((struct pipe_screen *)pipe->ctx->screen,
+		view->base.format, swizzle, &word4, &yuv_format);
+
+	if (format == ~0) {
+		format = 0;
+	}
+
+	endian = r600_colorformat_endian_swap(format);
+
+	height = view->base.texture->height0;
+	depth = view->base.texture->depth0;
+
+	pitch = align(tmp->pitch_in_blocks[0] *
+		util_format_get_blockwidth(tmp->real_format), 8);
+	array_mode = tmp->array_mode[0];
+	tile_type = tmp->tile_type;
+
+	assert(view->base.texture->target != PIPE_TEXTURE_1D_ARRAY);
+	assert(view->base.texture->target != PIPE_TEXTURE_2D_ARRAY);
+
+	evergreen_emit_raw_value(res, PKT3C(PKT3_SET_RESOURCE, 8, 0));
+	evergreen_emit_raw_value(res, (id+816)*32 >> 2); ///TODO: check this line
+	evergreen_emit_raw_value(res,
+		(S_030000_DIM(r600_tex_dim(view->base.texture->target)) |
+
S_030000_PITCH((pitch / 8) - 1) |
+		S_030000_NON_DISP_TILING_ORDER(tile_type) |
+		S_030000_TEX_WIDTH(view->base.texture->width0 - 1)));
+	evergreen_emit_raw_value(res, (S_030004_TEX_HEIGHT(height - 1) |
+		S_030004_TEX_DEPTH(depth - 1) |
+		S_030004_ARRAY_MODE(array_mode)));
+	/* NOTE(review): the level-0 offset is emitted twice and two relocs
+	 * are forced at the end - presumably one per address word; confirm. */
+	evergreen_emit_raw_value(res, tmp->offset[0] >> 8);
+	evergreen_emit_raw_value(res, tmp->offset[0] >> 8);
+	evergreen_emit_raw_value(res, (word4 |
+		S_030010_SRF_MODE_ALL(V_030010_SRF_MODE_ZERO_CLAMP_MINUS_ONE) |
+		S_030010_ENDIAN_SWAP(endian) |
+		S_030010_BASE_LEVEL(0)));
+	evergreen_emit_raw_value(res, (S_030014_LAST_LEVEL(0) |
+		S_030014_BASE_ARRAY(0) |
+		S_030014_LAST_ARRAY(0)));
+	evergreen_emit_raw_value(res, (S_030018_MAX_ANISO(4 /* max 16 samples */)));
+	evergreen_emit_raw_value(res,
+		S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_TEXTURE)
+		| S_03001C_DATA_FORMAT(format));
+
+	res->bo = (struct r600_resource*)view->base.texture;
+
+	res->usage = RADEON_USAGE_READ;
+
+	res->coher_bo_size = tmp->offset[0] + util_format_get_blockwidth(tmp->real_format)*view->base.texture->width0*height*depth;
+	res->flags = COMPUTE_RES_TC_FLUSH;
+
+	evergreen_emit_force_reloc(res);
+	evergreen_emit_force_reloc(res);
+}
+
+/**
+ * Emit a SET_SAMPLER packet for sampler slot \p id from \p sampler->state
+ * (wrap modes, min/mag filters, LOD range and bias, cube-map wrapping).
+ * The border colour type is always opaque black.
+ */
+void evergreen_set_sampler_resource(
+	struct r600_pipe_compute *pipe,
+	struct compute_sampler_state *sampler,
+	int id)
+{
+	struct evergreen_compute_resource* res =
+		get_empty_res(pipe, COMPUTE_RESOURCE_SAMPLER, id);
+
+	/* OR-ed into both filter fields when anisotropic filtering is on. */
+	unsigned aniso_flag_offset = sampler->state.max_anisotropy > 1 ? 2 : 0;
+
+	evergreen_emit_raw_value(res, PKT3C(PKT3_SET_SAMPLER, 3, 0));
+	evergreen_emit_raw_value(res, (id + 90)*3);
+	evergreen_emit_raw_value(res,
+		S_03C000_CLAMP_X(r600_tex_wrap(sampler->state.wrap_s)) |
+		S_03C000_CLAMP_Y(r600_tex_wrap(sampler->state.wrap_t)) |
+		S_03C000_CLAMP_Z(r600_tex_wrap(sampler->state.wrap_r)) |
+		S_03C000_XY_MAG_FILTER(r600_tex_filter(sampler->state.mag_img_filter) | aniso_flag_offset) |
+		S_03C000_XY_MIN_FILTER(r600_tex_filter(sampler->state.min_img_filter) | aniso_flag_offset) |
+		S_03C000_BORDER_COLOR_TYPE(V_03C000_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK)
+	);
+	evergreen_emit_raw_value(res,
+		S_03C004_MIN_LOD(S_FIXED(CLAMP(sampler->state.min_lod, 0, 15), 8)) |
+		S_03C004_MAX_LOD(S_FIXED(CLAMP(sampler->state.max_lod, 0, 15), 8))
+	);
+	evergreen_emit_raw_value(res,
+		S_03C008_LOD_BIAS(S_FIXED(CLAMP(sampler->state.lod_bias, -16, 16), 8)) |
+		(sampler->state.seamless_cube_map ? 0 : S_03C008_DISABLE_CUBE_WRAP(1)) |
+		S_03C008_TYPE(1)
+	);
+}
+
+/**
+ * Bind \p cbo as LS constant cache \p cache_id.
+ *
+ * \param cache_id cache slot, 0..15
+ * \param size     value for SQ_ALU_CONST_BUFFER_SIZE_LS_n, below 0x200
+ * \param offset   byte offset, must be a multiple of 256 (the register
+ *                 receives offset >> 8)
+ */
+void evergreen_set_const_cache(
+	struct r600_pipe_compute *pipe,
+	int cache_id,
+	struct r600_resource* cbo,
+	int size, int offset)
+{
+	#define SQ_ALU_CONST_BUFFER_SIZE_LS_0 0x00028fc0
+	#define SQ_ALU_CONST_CACHE_LS_0 0x00028f40
+
+	struct evergreen_compute_resource* res;
+
+	/* Validate before cache_id is used to look up the resource slot. */
+	assert(cache_id >= 0 && cache_id < 16);
+	assert(size >= 0 && size < 0x200);
+	assert(offset >= 0 && (offset & 0xFF) == 0);
+
+	res = get_empty_res(pipe, COMPUTE_RESOURCE_CONST_MEM, cache_id);
+
+	evergreen_reg_set(res, SQ_ALU_CONST_BUFFER_SIZE_LS_0 + cache_id*4, size);
+	evergreen_reg_set(res, SQ_ALU_CONST_CACHE_LS_0 + cache_id*4, offset >> 8);
+	res->bo = cbo;
+	res->usage = RADEON_USAGE_READ;
+	res->coher_bo_size = size;
+	res->flags = COMPUTE_RES_SH_FLUSH;
+}
+
+/**
+ * Create a buffer of \p size bytes through pipe_buffer_create() with
+ * PIPE_BIND_CUSTOM / PIPE_USAGE_IMMUTABLE.  \p size must be non-zero.
+ * The result of pipe_buffer_create() is returned unchecked.
+ */
+struct r600_resource* r600_compute_buffer_alloc_vram(
+	struct r600_screen *screen,
+	unsigned size)
+{
+	assert(size);
+
+	struct pipe_resource * buffer = pipe_buffer_create(
+		(struct pipe_screen*) screen,
+		PIPE_BIND_CUSTOM,
+		PIPE_USAGE_IMMUTABLE,
+		size);
+
+	return (struct r600_resource *)buffer;
+}
diff
--git a/src/gallium/drivers/r600/evergreen_compute_internal.h b/src/gallium/drivers/r600/evergreen_compute_internal.h
new file mode 100644
index 00000000000..340ff4b557e
--- /dev/null
+++ b/src/gallium/drivers/r600/evergreen_compute_internal.h
@@ -0,0 +1,119 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Adam Rak
+ */
+
+#ifndef EVERGREEN_COMPUTE_INTERNAL_H
+#define EVERGREEN_COMPUTE_INTERNAL_H
+
+#include "compute_memory_pool.h"
+
+/* One COMPUTE_RESOURCE_<name> enumerator per DECL_COMPUTE_RESOURCE() entry
+ * in compute_resource.def, followed by an end marker. */
+enum evergreen_compute_resources
+{
+#define DECL_COMPUTE_RESOURCE(name, n) COMPUTE_RESOURCE_ ## name ,
+#include "compute_resource.def"
+#undef DECL_COMPUTE_RESOURCE
+__COMPUTE_RESOURCE_END__
+};
+
+typedef unsigned u32;
+
+/* Values for evergreen_compute_resource::flags. */
+#define COMPUTE_RES_TC_FLUSH 0xF0001
+#define COMPUTE_RES_VC_FLUSH 0xF0002
+#define COMPUTE_RES_SH_FLUSH 0xF0004
+/* Argument parenthesised so that expressions such as (a | b) shift as a whole. */
+#define COMPUTE_RES_CB_FLUSH(x) (0xF0008 | ((x) << 8))
+#define COMPUTE_RES_FULL_FLUSH 0xF0010
+
+/* State recorded for one compute resource slot. */
+struct evergreen_compute_resource {
+	int enabled;
+
+	int do_reloc[256];
+	u32 cs[256];
+	int cs_end;
+
+	struct r600_resource *bo;
+	int coher_bo_size;
+	enum radeon_bo_usage usage;
+	int flags; /* COMPUTE_RES_*_FLUSH flags */
+};
+
+struct compute_sampler_state {
+	struct r600_pipe_state base;
+	struct pipe_sampler_state state;
+};
+
+/* Hardware format description filled in by evergreen_compute_get_gpu_format(). */
+struct number_type_and_format {
+	unsigned format;
+	unsigned number_type;
+	unsigned num_format_all;
+};
+
+struct r600_pipe_compute {
+	struct r600_context *ctx;
+	struct r600_bytecode bc;
+	struct tgsi_token *tokens;
+
+	struct evergreen_compute_resource *resources;
+
+	unsigned local_size;
+	unsigned private_size;
+	unsigned input_size;
+#ifdef HAVE_OPENCL
+	LLVMModuleRef mod;
+#endif
+	struct r600_resource *kernel_param;
+	struct r600_resource *shader_code_bo;
+};
+
+/* Get the hardware format of a resource; returns 0 on failure, non-zero on success. */
+int evergreen_compute_get_gpu_format(struct number_type_and_format* fmt, struct r600_resource *bo);
+
+
+void evergreen_emit_raw_reg_set(struct evergreen_compute_resource* res, unsigned index, int num);
+void evergreen_emit_ctx_reg_set(struct r600_context *ctx, unsigned index, int num);
+void evergreen_emit_raw_value(struct evergreen_compute_resource* res, unsigned value);
+void evergreen_emit_ctx_value(struct r600_context *ctx, unsigned value);
+void evergreen_mult_reg_set_(struct evergreen_compute_resource* res, int index, u32* array, int size);
+void evergreen_emit_ctx_reloc(struct r600_context *ctx, struct r600_resource *bo, enum radeon_bo_usage usage);
+void evergreen_reg_set(struct evergreen_compute_resource* res, unsigned index, unsigned value);
+void evergreen_emit_force_reloc(struct evergreen_compute_resource* res);
+
+void evergreen_set_buffer_sync(struct r600_context *ctx, struct r600_resource* bo, int size, int flags, enum radeon_bo_usage usage);
+
+struct evergreen_compute_resource* get_empty_res(struct r600_pipe_compute*, enum evergreen_compute_resources res_code, int index);
+int get_compute_resource_num(void);
+
+/* Convenience wrapper: `array` must be a real array (not a pointer), because
+ * its size is taken with sizeof. */
+#define evergreen_mult_reg_set(res, index, array) evergreen_mult_reg_set_(res, index, array, sizeof(array))
+
+void evergreen_set_rat(struct r600_pipe_compute *pipe, int id, struct r600_resource* bo, int start, int size);
+void evergreen_set_lds(struct r600_pipe_compute *pipe, int num_lds, int size, int num_waves);
+void evergreen_set_gds(struct r600_pipe_compute *pipe, uint32_t addr, uint32_t size);
+void evergreen_set_export(struct r600_pipe_compute *pipe, struct r600_resource* bo, int offset, int size);
+void evergreen_set_loop_const(struct r600_pipe_compute *pipe, int id, int count, int init, int inc);
+void evergreen_set_tmp_ring(struct r600_pipe_compute *pipe, struct r600_resource* bo, int offset, int size, int se);
+void evergreen_set_vtx_resource(struct r600_pipe_compute *pipe, struct r600_resource* bo, int id, uint64_t offset, int writable);
+void evergreen_set_tex_resource(struct r600_pipe_compute *pipe, struct r600_pipe_sampler_view* view, int id);
+void evergreen_set_sampler_resource(struct r600_pipe_compute *pipe, struct compute_sampler_state *sampler, int id);
+void evergreen_set_const_cache(struct r600_pipe_compute *pipe, int cache_id, struct r600_resource* cbo, int size, int offset);
+
+struct r600_resource* r600_compute_buffer_alloc_vram(struct r600_screen *screen, unsigned size);
+
+#endif
diff --git
a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index ec0afe52adf..b618ca881ba 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -28,6 +28,7 @@
 #include "util/u_memory.h"
 #include "util/u_framebuffer.h"
 #include "util/u_dual_blend.h"
+#include "evergreen_compute.h"
 
 static uint32_t eg_num_banks(uint32_t nbanks)
 {
@@ -1881,6 +1882,7 @@ void evergreen_init_state_functions(struct r600_context *rctx)
 	rctx->context.create_stream_output_target = r600_create_so_target;
 	rctx->context.stream_output_target_destroy = r600_so_target_destroy;
 	rctx->context.set_stream_output_targets = r600_set_so_targets;
+	evergreen_init_compute_state_functions(rctx);
 }
 
 static void cayman_init_atom_start_cs(struct r600_context *rctx)
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index 105d80f061d..3b6d7304551 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -61,6 +61,8 @@
 #define R600_TEXEL_PITCH_ALIGNMENT_MASK 0x7
 
 #define PKT3_NOP 0x10
+#define PKT3_DISPATCH_DIRECT 0x15
+#define PKT3_DISPATCH_INDIRECT 0x16
 #define PKT3_INDIRECT_BUFFER_END 0x17
 #define PKT3_SET_PREDICATION 0x20
 #define PKT3_REG_RMW 0x21
@@ -114,6 +116,11 @@
 #define PKT3_PREDICATE(x) (((x) >> 0) & 0x1)
 #define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count))
+#define RADEON_CP_PACKET3_COMPUTE_MODE 0x00000002
+
+/* Evergreen compute packet3: a type-3 packet header with RADEON_CP_PACKET3_COMPUTE_MODE OR-ed in */
+#define PKT3C(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate) | RADEON_CP_PACKET3_COMPUTE_MODE)
+
 /* Registers */
 #define R_0084FC_CP_STRMOUT_CNTL 0x000084FC
 #define S_0084FC_OFFSET_UPDATE_DONE(x) (((x) & 0x1) << 0)
@@ -241,6 +248,15 @@
 #define G_008CF0_ALU_UPDATE_FIFO_HIWATER(x) (((x) >> 24) & 0x1F)
 #define C_008CF0_ALU_UPDATE_FIFO_HIWATER(x) 0xE0FFFFFF
 
+#define R_008E20_SQ_STATIC_THREAD_MGMT1 0x8E20
+#define R_008E24_SQ_STATIC_THREAD_MGMT2 0x8E24
+#define R_008E28_SQ_STATIC_THREAD_MGMT3 0x8E28
+
+#define R_00899C_VGT_COMPUTE_START_X 0x0000899C
+#define R_0089A0_VGT_COMPUTE_START_Y 0x000089A0
+#define R_0089A4_VGT_COMPUTE_START_Z 0x000089A4
+#define R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE 0x000089AC
+
 #define R_009100_SPI_CONFIG_CNTL 0x00009100
 #define R_00913C_SPI_CONFIG_CNTL_1 0x0000913C
 #define S_00913C_VTX_DONE_DELAY(x) (((x) & 0xF) << 0)
@@ -397,6 +413,11 @@
 #define G_028410_ALPHA_TEST_BYPASS(x) (((x) >> 8) & 0x1)
 #define C_028410_ALPHA_TEST_BYPASS 0xFFFFFEFF
 
+#define R_0286EC_SPI_COMPUTE_NUM_THREAD_X 0x0286EC
+#define R_0286F0_SPI_COMPUTE_NUM_THREAD_Y 0x0286F0
+#define R_0286F4_SPI_COMPUTE_NUM_THREAD_Z 0x0286F4
+#define R_028B74_VGT_DISPATCH_INITIATOR 0x028B74
+
 #define R_028800_DB_DEPTH_CONTROL 0x028800
 #define S_028800_STENCIL_ENABLE(x) (((x) & 0x1) << 0)
 #define G_028800_STENCIL_ENABLE(x) (((x) >> 0) & 0x1)
@@ -747,6 +768,8 @@
 #define S_028A40_CUT_MODE(x) (((x) & 0x3) << 3)
 #define G_028A40_CUT_MODE(x) (((x) >> 3) & 0x3)
 #define C_028A40_CUT_MODE 0xFFFFFFE7
+#define S_028A40_COMPUTE_MODE(x) ((x) << 14)
+#define S_028A40_PARTIAL_THD_AT_EOI(x) ((x) << 17)
 #define R_028A6C_VGT_GS_OUT_PRIM_TYPE 0x028A6C
 #define S_028A6C_OUTPRIM_TYPE(x) (((x) & 0x3F) << 0)
 #define V_028A6C_OUTPRIM_TYPE_POINTLIST 0
@@ -1434,6 +1457,50 @@
 #define G_028848_ALLOW_DOUBLE_DENORM_OUT(x) (((x) >> 7) & 0x1)
 #define C_028848_ALLOW_DOUBLE_DENORM_OUT 0xFFFFFF7F
 
+#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4
+#define S_0288D4_NUM_GPRS(x) (((x) & 0xFF) << 0)
+#define G_0288D4_NUM_GPRS(x) (((x) >> 0) & 0xFF)
+#define C_0288D4_NUM_GPRS 0xFFFFFF00
+#define S_0288D4_STACK_SIZE(x) (((x) & 0xFF) << 8)
+#define G_0288D4_STACK_SIZE(x) (((x) >> 8) & 0xFF)
+#define C_0288D4_STACK_SIZE 0xFFFF00FF
+#define S_0288D4_DX10_CLAMP(x) (((x) & 0x1) << 21)
+#define G_0288D4_DX10_CLAMP(x) (((x) >> 21) & 0x1)
+#define C_0288D4_DX10_CLAMP 0xFFDFFFFF
+#define S_0288D4_PRIME_CACHE_ON_DRAW(x) (((x) & 0x1) << 23)
+#define G_0288D4_PRIME_CACHE_ON_DRAW(x) (((x) >> 23) & 0x1)
+#define S_0288D4_UNCACHED_FIRST_INST(x) (((x) & 0x1) << 28)
+#define G_0288D4_UNCACHED_FIRST_INST(x) (((x) >> 28) & 0x1)
+#define C_0288D4_UNCACHED_FIRST_INST 0xEFFFFFFF
+/* Cast to unsigned: shifting a signed 1 into bit 31 is undefined. */
+#define S_0288D4_CLAMP_CONSTS(x) (((unsigned)(x) & 0x1) << 31)
+#define G_0288D4_CLAMP_CONSTS(x) (((x) >> 31) & 0x1)
+#define C_0288D4_CLAMP_CONSTS 0x7FFFFFFF
+
+#define R_0288D8_SQ_PGM_RESOURCES_LS_2 0x0288d8
+
+
 #define R_028644_SPI_PS_INPUT_CNTL_0 0x028644
 #define S_028644_SEMANTIC(x) (((x) & 0xFF) << 0)
 #define G_028644_SEMANTIC(x) (((x) >> 0) & 0xFF)
@@ -1710,6 +1777,12 @@
 #define R_0286DC_SPI_FOG_CNTL 0x000286DC
 #define R_0286E4_SPI_PS_IN_CONTROL_2 0x000286E4
 #define R_0286E8_SPI_COMPUTE_INPUT_CNTL 0x000286E8
+#define S_0286E8_TID_IN_GROUP_ENA 1
+#define S_0286E8_TGID_ENA 2
+#define S_0286E8_DISABLE_INDEX_PACK 4
+#define R_028720_GDS_ADDR_BASE 0x00028720
+#define R_028724_GDS_ADDR_SIZE 0x00028724
+#define R_028728_GDS_ORDERED_WAVE_PER_SE 0x00028728
 #define R_028784_CB_BLEND1_CONTROL
0x00028784
 #define R_028788_CB_BLEND2_CONTROL 0x00028788
 #define R_02878C_CB_BLEND3_CONTROL 0x0002878C
@@ -1736,6 +1809,7 @@
 #define C_02884C_EXPORT_Z 0xFFFFFFFE
 #define R_02885C_SQ_PGM_START_VS 0x0002885C
 #define R_0288A4_SQ_PGM_START_FS 0x000288A4
+#define R_0288D0_SQ_PGM_START_LS 0x000288d0
 #define R_0288A8_SQ_PGM_RESOURCES_FS 0x000288A8
 #define R_0288EC_SQ_LDS_ALLOC_PS 0x000288EC
 #define R_028900_SQ_ESGS_RING_ITEMSIZE 0x00028900
diff --git a/src/gallium/drivers/r600/llvm_wrapper.cpp b/src/gallium/drivers/r600/llvm_wrapper.cpp
new file mode 100644
index 00000000000..174fb013c83
--- /dev/null
+++ b/src/gallium/drivers/r600/llvm_wrapper.cpp
@@ -0,0 +1,19 @@
+#include <llvm/ADT/OwningPtr.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/LLVMContext.h>
+#include <llvm/Support/IRReader.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Support/SourceMgr.h>
+
+#include "llvm_wrapper.h"
+
+/* Parse `bitcode_len` bytes of LLVM IR from `bitcode` (copied into a fresh
+ * MemoryBuffer) and return the module wrapped as a C handle.  The diagnostic
+ * filled in by ParseIR is not inspected, so whatever ParseIR returns on
+ * failure is passed straight to the caller. */
+extern "C" LLVMModuleRef llvm_parse_bitcode(const unsigned char * bitcode, unsigned bitcode_len)
+{
+	llvm::OwningPtr<llvm::Module> M;
+	llvm::StringRef str((const char*)bitcode, bitcode_len);
+	llvm::MemoryBuffer* buffer = llvm::MemoryBuffer::getMemBufferCopy(str);
+	llvm::SMDiagnostic Err;
+	M.reset(llvm::ParseIR(buffer, Err, llvm::getGlobalContext()));
+	return wrap(M.take());
+}
diff --git a/src/gallium/drivers/r600/llvm_wrapper.h b/src/gallium/drivers/r600/llvm_wrapper.h
new file mode 100644
index 00000000000..3a696455cdf
--- /dev/null
+++ b/src/gallium/drivers/r600/llvm_wrapper.h
@@ -0,0 +1,16 @@
+#ifndef LLVM_WRAPPER_H
+#define LLVM_WRAPPER_H
+
+#include <llvm-c/Core.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+LLVMModuleRef llvm_parse_bitcode(const unsigned char * bitcode, unsigned bitcode_len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gallium/drivers/r600/r600_llvm.h b/src/gallium/drivers/r600/r600_llvm.h
index 0f6a1f88341..090d909a475 100644
--- a/src/gallium/drivers/r600/r600_llvm.h
+++ b/src/gallium/drivers/r600/r600_llvm.h
@@ -2,7 +2,7 @@
 #ifndef R600_LLVM_H
 #define R600_LLVM_H
 
-#ifdef R600_USE_LLVM
+#if defined R600_USE_LLVM || defined HAVE_OPENCL
 
 #include "radeon_llvm.h"
 #include <llvm-c/Core.h>
@@ -24,6 +24,6 @@ unsigned
r600_llvm_compile(
 	enum radeon_family family,
 	unsigned dump);
 
-#endif /* R600_USE_LLVM */
+#endif /* defined R600_USE_LLVM || defined HAVE_OPENCL */
 
 #endif /* R600_LLVM_H */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index cb13ca767c9..e0ee823ce39 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -382,6 +382,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
 	case PIPE_CAP_USER_INDEX_BUFFERS:
 	case PIPE_CAP_USER_CONSTANT_BUFFERS:
+	case PIPE_CAP_COMPUTE:
 		return 1;
 
 	case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
@@ -409,7 +410,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
 	case PIPE_CAP_VERTEX_COLOR_CLAMPED:
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
-	case PIPE_CAP_COMPUTE:
 		return 0;
 
 	/* Stream output. */
@@ -491,6 +491,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 	{
 	case PIPE_SHADER_FRAGMENT:
 	case PIPE_SHADER_VERTEX:
+	case PIPE_SHADER_COMPUTE:
 		break;
 	case PIPE_SHADER_GEOMETRY:
 		/* XXX: support and enable geometry programs */
@@ -538,8 +539,12 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 		return rscreen->glsl_feature_level >= 130;
 	case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
 		return 16;
-	case PIPE_SHADER_CAP_PREFERRED_IR:
-		return PIPE_SHADER_IR_TGSI;
+	case PIPE_SHADER_CAP_PREFERRED_IR:
+		if (shader == PIPE_SHADER_COMPUTE) {
+			return PIPE_SHADER_IR_LLVM;
+		} else {
+			return PIPE_SHADER_IR_TGSI;
+		}
 	}
 	return 0;
 }
@@ -569,6 +574,81 @@ static int r600_get_video_param(struct pipe_screen *screen,
 	}
 }
 
+/**
+ * Report one compute capability.
+ *
+ * When \p ret is non-NULL the value is stored through it: a NUL-terminated
+ * string for PIPE_COMPUTE_CAP_IR_TARGET, otherwise one or more uint64_t.
+ *
+ * \return the size in bytes of the value for \p param, or 0 (after logging
+ *         to stderr) for an unknown capability.
+ */
+static int r600_get_compute_param(struct pipe_screen *screen,
+        enum pipe_compute_cap param,
+        void *ret)
+{
+	//TODO: select these params by asic
+	switch (param) {
+	case PIPE_COMPUTE_CAP_IR_TARGET: {
+		/* The reported size is derived from the string itself so
+		 * the two can never drift apart. */
+		static const char ir_target[] = "r600--";
+
+		if (ret) {
+			strcpy(ret, ir_target);
+		}
+		return sizeof(ir_target);
+	}
+
+	case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+		if (ret) {
+			uint64_t * grid_dimension = ret;
+			grid_dimension[0] = 3;
+		}
+		return 1 * sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+		if (ret) {
+			uint64_t * grid_size = ret;
+			grid_size[0] = 65535;
+			grid_size[1] = 65535;
+			grid_size[2] = 1;
+		}
+		return 3 * sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+		if (ret) {
+			uint64_t * block_size = ret;
+			block_size[0] = 256;
+			block_size[1] = 256;
+			block_size[2] = 256;
+		}
+		return 3 * sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+		if (ret) {
+			uint64_t * max_threads_per_block = ret;
+			*max_threads_per_block = 256;
+		}
+		return sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
+		if (ret) {
+			uint64_t * max_global_size = ret;
+			/* XXX: This is what the proprietary driver reports, we
+			 * may want to use a different value. */
+			*max_global_size = 201326592;
+		}
+		return sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
+		if (ret) {
+			uint64_t * max_input_size = ret;
+			*max_input_size = 1024;
+		}
+		return sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
+		if (ret) {
+			uint64_t * max_local_size = ret;
+			/* XXX: This is what the proprietary driver reports, we
+			 * may want to use a different value. */
+			*max_local_size = 32768;
+		}
+		return sizeof(uint64_t);
+
+	default:
+		fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
+		return 0;
+	}
+}
+
 static void r600_destroy_screen(struct pipe_screen* pscreen)
 {
 	struct r600_screen *rscreen = (struct r600_screen *)pscreen;
@@ -576,6 +656,10 @@ static void r600_destroy_screen(struct pipe_screen* pscreen)
 	if (rscreen == NULL)
 		return;
 
+	if (rscreen->global_pool) {
+		compute_memory_pool_delete(rscreen->global_pool);
+	}
+
 	if (rscreen->fences.bo) {
 		struct r600_fence_block *entry, *tmp;
 
@@ -833,6 +917,8 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
 	rscreen->screen.get_shader_param = r600_get_shader_param;
 	rscreen->screen.get_paramf = r600_get_paramf;
 	rscreen->screen.get_video_param = r600_get_video_param;
+	rscreen->screen.get_compute_param = r600_get_compute_param;
+
 	if (rscreen->chip_class >= EVERGREEN) {
 		rscreen->screen.is_format_supported = evergreen_is_format_supported;
 	} else {
@@ -857,5 +943,7 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
 	rscreen->use_surface_alloc = debug_get_bool_option("R600_SURF", TRUE);
 	rscreen->glsl_feature_level = debug_get_bool_option("R600_GLSL130", TRUE) ?
130 : 120; + rscreen->global_pool = compute_memory_pool_new(1024*16, rscreen); + return &rscreen->screen; } diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index e5ba49c5ac5..f2865d2a22e 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -28,8 +28,11 @@ #include "util/u_slab.h" #include "r600.h" +#include "r600_llvm.h" +#include "r600_public.h" #include "r600_shader.h" #include "r600_resource.h" +#include "evergreen_compute.h" #define R600_MAX_CONST_BUFFERS 2 #define R600_MAX_CONST_BUFFER_SIZE 4096 @@ -98,9 +101,16 @@ enum r600_pipe_state_id { R600_PIPE_STATE_RESOURCE, R600_PIPE_STATE_POLYGON_OFFSET, R600_PIPE_STATE_FETCH_SHADER, + R600_PIPE_STATE_SPI, R600_PIPE_NSTATES }; +struct compute_memory_pool; +void compute_memory_pool_delete(struct compute_memory_pool* pool); +struct compute_memory_pool* compute_memory_pool_new( + int64_t initial_size_in_dw, + struct r600_screen *rscreen); + struct r600_pipe_fences { struct r600_resource *bo; unsigned *data; @@ -123,6 +133,12 @@ struct r600_screen { bool use_surface_alloc; int glsl_feature_level; + + /*for compute global memory binding, we allocate stuff here, instead of + * buffers. + * XXX: Not sure if this is the best place for global_pool. Also, + * it's not thread safe, so it won't work with multiple contexts. 
*/
+	struct compute_memory_pool *global_pool;
 };
 
 struct r600_pipe_sampler_view {
@@ -257,6 +273,7 @@ struct r600_context {
 	struct pipe_clip_state clip;
 	struct r600_pipe_shader *ps_shader;
 	struct r600_pipe_shader *vs_shader;
+	struct r600_pipe_compute *cs_shader;
 	struct r600_pipe_rasterizer *rasterizer;
 	struct r600_pipe_state vgt;
 	struct r600_pipe_state spi;
@@ -266,7 +283,9 @@ struct r600_context {
 	unsigned saved_render_cond_mode;
 	/* shader information */
 	boolean two_side;
+	boolean spi_dirty;
 	unsigned sprite_coord_enable;
+	boolean flatshade;
 	boolean export_16bpc;
 	unsigned alpha_ref;
 	boolean alpha_ref_dirty;
@@ -412,6 +431,10 @@ void r600_init_context_resource_functions(struct r600_context *r600);
 
 /* r600_shader.c */
 int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *shader);
+#ifdef HAVE_OPENCL
+int r600_compute_shader_create(struct pipe_context * ctx,
+	LLVMModuleRef mod, struct r600_bytecode * bytecode);
+#endif
 void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader);
 int r600_find_vs_semantic_index(struct r600_shader *vs, struct r600_shader *ps, int id);
 
diff --git a/src/gallium/drivers/r600/r600_resource.c b/src/gallium/drivers/r600/r600_resource.c
index ef0b4ceffd0..0c14a2dc6bc 100644
--- a/src/gallium/drivers/r600/r600_resource.c
+++ b/src/gallium/drivers/r600/r600_resource.c
@@ -27,7 +27,12 @@ static struct pipe_resource *r600_resource_create(struct pipe_screen *screen,
 						const struct pipe_resource *templ)
 {
 	if (templ->target == PIPE_BUFFER) {
-		return r600_buffer_create(screen, templ);
+		if (templ->bind & PIPE_BIND_GLOBAL) {
+			return r600_compute_global_buffer_create(screen, templ);
+		}
+		else {
+			return r600_buffer_create(screen, templ);
+		}
 	} else {
 		return r600_texture_create(screen, templ);
 	}
@@ -44,12 +49,21 @@ static struct pipe_resource *r600_resource_from_handle(struct pipe_screen * scre
 	}
 }
 
+/* Destroy a resource: buffers bound with PIPE_BIND_GLOBAL go to the compute
+ * global-buffer destructor, everything else to the generic vtbl destructor. */
+void r600_resource_destroy(struct pipe_screen *screen, struct pipe_resource *res)
+{
+	const int is_global_buffer =
+		res->target == PIPE_BUFFER && (res->bind & PIPE_BIND_GLOBAL) != 0;
+
+	if (!is_global_buffer) {
+		u_resource_destroy_vtbl(screen, res);
+		return;
+	}
+
+	r600_compute_global_buffer_destroy(screen, res);
+}
+
 void r600_init_screen_resource_functions(struct pipe_screen *screen)
 {
 	screen->resource_create = r600_resource_create;
 	screen->resource_from_handle = r600_resource_from_handle;
 	screen->resource_get_handle = u_resource_get_handle_vtbl;
-	screen->resource_destroy = u_resource_destroy_vtbl;
+	screen->resource_destroy = r600_resource_destroy;
 }
 
 void r600_init_context_resource_functions(struct r600_context *r600)
diff --git a/src/gallium/drivers/r600/r600_resource.h b/src/gallium/drivers/r600/r600_resource.h
index 87bef730654..d401e40c5ba 100644
--- a/src/gallium/drivers/r600/r600_resource.h
+++ b/src/gallium/drivers/r600/r600_resource.h
@@ -34,6 +34,13 @@ struct r600_transfer {
 	unsigned offset;
 };
 
+struct compute_memory_item;
+
+struct r600_resource_global {
+	struct r600_resource base;
+	struct compute_memory_item *chunk;
+};
+
 struct r600_resource_texture {
 	struct r600_resource resource;
 
@@ -65,6 +72,7 @@ struct r600_surface {
 	unsigned aligned_height;
 };
 
+void r600_resource_destroy(struct pipe_screen *screen, struct pipe_resource *res);
 void r600_init_screen_resource_functions(struct pipe_screen *screen);
 
 /* r600_texture */
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index dc208b923cb..5f3c76eafbb 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -225,6 +225,37 @@ static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
  * struct r600_bytecode.
*/
+
+static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
+				unsigned char * bytes,	unsigned num_bytes);
+
+#ifdef HAVE_OPENCL
+/**
+ * Compile the LLVM module \p mod with r600_llvm_compile() and turn the
+ * resulting byte stream into \p bytecode (initialised here, typed as a
+ * compute program, then built).  Shaders are dumped when the
+ * R600_DUMP_SHADERS debug option is set.
+ *
+ * \return always 1; results of the compile and build steps are not checked.
+ */
+int r600_compute_shader_create(struct pipe_context * ctx,
+	LLVMModuleRef mod, struct r600_bytecode * bytecode)
+{
+	struct r600_context *r600_ctx = (struct r600_context *)ctx;
+	/* Start from defined values in case the compiler writes nothing. */
+	unsigned char * bytes = NULL;
+	unsigned byte_count = 0;
+	/* Only .bc is set below; zero the rest so the byte-stream parser
+	 * never sees indeterminate fields. */
+	struct r600_shader_ctx shader_ctx = {0};
+	unsigned dump = 0;
+
+	if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
+		dump = 1;
+	}
+
+	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
+	shader_ctx.bc = bytecode;
+	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family);
+	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
+	/* NOTE(review): ownership of `bytes` is not visible here - confirm
+	 * whether it must be freed after parsing. */
+	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
+	r600_bytecode_build(shader_ctx.bc);
+	if (dump) {
+		r600_bytecode_dump(shader_ctx.bc);
+	}
+	return 1;
+}
+
+#endif /* HAVE_OPENCL */
+
 static unsigned r600_src_from_byte_stream(unsigned char * bytes,
 		unsigned bytes_read, struct r600_bytecode_alu * alu,
 		unsigned src_idx)
 {
diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c
index d6f85c38c32..5b159908adb 100644
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -916,6 +916,10 @@ void* r600_texture_transfer_map(struct pipe_context *ctx,
 	unsigned offset = 0;
 	char *map;
 
+	if ((transfer->resource->bind & PIPE_BIND_GLOBAL) && transfer->resource->target == PIPE_BUFFER) {
+		return r600_compute_global_transfer_map(ctx, transfer);
+	}
+
 	if (rtransfer->staging) {
 		buf = ((struct r600_resource *)rtransfer->staging)->cs_buf;
 	} else {
@@ -945,6 +949,10 @@ void r600_texture_transfer_unmap(struct pipe_context *ctx,
 	struct r600_context *rctx = (struct r600_context*)ctx;
 	struct radeon_winsys_cs_handle *buf;
 
+	if ((transfer->resource->bind & PIPE_BIND_GLOBAL) && transfer->resource->target == PIPE_BUFFER) {
+		return r600_compute_global_transfer_unmap(ctx,
transfer); + } + if (rtransfer->staging) { buf = ((struct r600_resource *)rtransfer->staging)->cs_buf; } else {