r600g: compute support for evergreen
authorAdam Rak <adam.rak@streamnovation.com>
Wed, 30 Nov 2011 21:20:41 +0000 (22:20 +0100)
committerTom Stellard <thomas.stellard@amd.com>
Fri, 1 Jun 2012 15:28:10 +0000 (11:28 -0400)
Tom Stellard:
  - Updated for gallium interface changes
  - Fixed a few bugs:
    + Set the loop counter
    + Calculate the correct number of pipes
  - Added hooks into the LLVM compiler

21 files changed:
configure.ac
src/gallium/drivers/r600/Makefile.am
src/gallium/drivers/r600/Makefile.sources
src/gallium/drivers/r600/compute_memory_pool.c [new file with mode: 0644]
src/gallium/drivers/r600/compute_memory_pool.h [new file with mode: 0644]
src/gallium/drivers/r600/compute_resource.def [new file with mode: 0644]
src/gallium/drivers/r600/evergreen_compute.c [new file with mode: 0644]
src/gallium/drivers/r600/evergreen_compute.h [new file with mode: 0644]
src/gallium/drivers/r600/evergreen_compute_internal.c [new file with mode: 0644]
src/gallium/drivers/r600/evergreen_compute_internal.h [new file with mode: 0644]
src/gallium/drivers/r600/evergreen_state.c
src/gallium/drivers/r600/evergreend.h
src/gallium/drivers/r600/llvm_wrapper.cpp [new file with mode: 0644]
src/gallium/drivers/r600/llvm_wrapper.h [new file with mode: 0644]
src/gallium/drivers/r600/r600_llvm.h
src/gallium/drivers/r600/r600_pipe.c
src/gallium/drivers/r600/r600_pipe.h
src/gallium/drivers/r600/r600_resource.c
src/gallium/drivers/r600/r600_resource.h
src/gallium/drivers/r600/r600_shader.c
src/gallium/drivers/r600/r600_texture.c

index db68a87f0070438c221d0ebcad93036d77a161fd..527accca52201422c3bb31d8b22231073ce7692d 100644 (file)
@@ -1993,13 +1993,18 @@ if test "x$with_gallium_drivers" != x; then
             PKG_CHECK_MODULES([RADEON], [libdrm_radeon >= $LIBDRM_RADEON_REQUIRED])
             gallium_require_drm_loader
             GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS r600"
-            if test "x$enable_r600_llvm" = xyes; then
+            if test "x$enable_r600_llvm" = xyes -o "x$enable_opencl" = xyes; then
                 if test "x$LLVM_VERSION" != "x3.1"; then
                     AC_MSG_ERROR([LLVM 3.1 is required for the r600 llvm compiler.])
                 fi
                 NEED_RADEON_GALLIUM=yes;
+            fi
+            if test "x$enable_r600_llvm" = xyes; then
                 USE_R600_LLVM_COMPILER=yes;
             fi
+            if test "x$enable_opencl" = xyes -a "x$with_llvm_shared_libs" = xno; then
+                LLVM_LIBS="${LLVM_LIBS} `llvm-config --libs bitreader asmparser`"
+            fi
             gallium_check_st "radeon/drm" "dri-r600" "xorg-r600" "" "xvmc-r600" "vdpau-r600" "va-r600"
             ;;
         xradeonsi)
index 77d2674d262276197ff719e65d335a92d354251f..31d885a34168fdb45eab8b95cf1281e13e921ba3 100644 (file)
@@ -18,7 +18,7 @@ AM_CFLAGS = \
 libr600_a_SOURCES = \
        $(C_SOURCES)
 
-if USE_R600_LLVM_COMPILER
+if NEED_RADEON_GALLIUM
 
 # This is a hack until we can move the backend into the LLVM project.
 # We need to use mklib, because it splits up libradeon.a into object files
@@ -26,18 +26,28 @@ if USE_R600_LLVM_COMPILER
 libr600_a_AR = $(top_srcdir)/bin/mklib -o r600 -static
 
 libr600_a_SOURCES += \
-       $(LLVM_C_SOURCES)
+       $(LLVM_C_SOURCES) \
+       $(LLVM_CXX_SOURCES)
 
 libr600_a_LIBADD = \
        $(top_builddir)/src/gallium/drivers/radeon/libradeon.a
 
 AM_CFLAGS += \
        $(LLVM_CFLAGS) \
-       -I$(top_srcdir)/src/gallium/drivers/radeon/ \
-       -DR600_USE_LLVM
+       -I$(top_srcdir)/src/gallium/drivers/radeon/
 
 AM_CXXFLAGS= \
        $(LLVM_CXXFLAGS)
 else
 libr600_a_AR = $(AR) $(ARFLAGS)
 endif
+
+if USE_R600_LLVM_COMPILER
+AM_CFLAGS += \
+       -DR600_USE_LLVM
+endif
+
+if HAVE_GALLIUM_COMPUTE
+AM_CFLAGS += \
+       -DHAVE_OPENCL
+endif
index b7b0d50b6376f06968799faa9155e9f47c7b09cb..50546e6fb2f5e1e7a0676b5287881056fed5e82a 100644 (file)
@@ -14,6 +14,10 @@ C_SOURCES = \
        evergreen_state.c \
        eg_asm.c \
        r600_translate.c \
-       r600_state_common.c
+       r600_state_common.c \
+       evergreen_compute.c \
+       evergreen_compute_internal.c \
+       compute_memory_pool.c
 
 LLVM_C_SOURCES = r600_llvm.c
+LLVM_CXX_SOURCES = llvm_wrapper.cpp
diff --git a/src/gallium/drivers/r600/compute_memory_pool.c b/src/gallium/drivers/r600/compute_memory_pool.c
new file mode 100644 (file)
index 0000000..01bf0c3
--- /dev/null
@@ -0,0 +1,397 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Adam Rak <adam.rak@streamnovation.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "util/u_blitter.h"
+#include "util/u_double_list.h"
+#include "util/u_transfer.h"
+#include "util/u_surface.h"
+#include "util/u_pack_color.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_framebuffer.h"
+#include "r600.h"
+#include "r600_resource.h"
+#include "r600_shader.h"
+#include "r600_pipe.h"
+#include "r600_formats.h"
+#include "compute_memory_pool.h"
+#include "evergreen_compute_internal.h"
+
+/**
+ * Creates a new pool of initial_size_in_dw dwords.
+ *
+ * Allocates the pool descriptor, a zeroed host shadow copy of
+ * initial_size_in_dw dwords and a pool buffer of initial_size_in_dw*4 bytes
+ * obtained from r600_compute_buffer_alloc_vram().
+ *
+ * \param initial_size_in_dw  initial size of the pool, in dwords
+ * \param rscreen             screen the pool buffer is allocated from
+ * \return the new pool, or NULL if a host allocation failed
+ */
+struct compute_memory_pool* compute_memory_pool_new(
+       int64_t initial_size_in_dw,
+       struct r600_screen * rscreen)
+{
+       /* Plain calloc() so the allocation matches the free() and realloc()
+        * calls that are applied to these pointers elsewhere in this file. */
+       struct compute_memory_pool* pool = calloc(1, sizeof(*pool));
+
+       if (!pool) {
+               return NULL;
+       }
+
+       pool->next_id = 1;
+       pool->size_in_dw = initial_size_in_dw;
+       pool->screen = rscreen;
+
+       /* Allocate the host shadow before the buffer object, so that a host
+        * allocation failure does not leave a buffer object behind. */
+       pool->shadow = calloc(pool->size_in_dw, 4);
+       if (!pool->shadow && pool->size_in_dw > 0) {
+               free(pool);
+               return NULL;
+       }
+
+       pool->bo = (struct r600_resource*)r600_compute_buffer_alloc_vram(
+                                       pool->screen, pool->size_in_dw*4);
+
+       return pool;
+}
+
+/**
+ * Destroys a pool: releases the host shadow copy, destroys the pool buffer
+ * object through the screen's resource_destroy hook and finally frees the
+ * pool struct itself.
+ */
+void compute_memory_pool_delete(struct compute_memory_pool* pool)
+{
+       struct r600_screen *rscreen = pool->screen;
+       struct pipe_resource *pool_bo = (struct pipe_resource *)pool->bo;
+
+       free(pool->shadow);
+       rscreen->screen.resource_destroy((struct pipe_screen *)rscreen,
+                                       pool_bo);
+       free(pool);
+}
+
+/**
+ * Searches the pool for a free range that can hold size_in_dw dwords.
+ *
+ * Walks the item list in order, ignoring pending items (start_in_dw == -1),
+ * and compares each gap between the padded end of the previous placed item
+ * and the start of the next placed item against the requested size.  If no
+ * gap fits, the space between the last placed item and the end of the pool
+ * is tried.
+ *
+ * \param pool        pool to search
+ * \param size_in_dw  requested size in dwords
+ * \return start offset of the free range in dwords, or -1 on failure
+ */
+int64_t compute_memory_prealloc_chunk(
+       struct compute_memory_pool* pool,
+       int64_t size_in_dw)
+{
+       struct compute_memory_item *item;
+
+       /* 64 bit wide like the offsets it is computed from and returned as;
+        * a plain int would truncate large offsets. */
+       int64_t last_end = 0;
+
+       assert(size_in_dw <= pool->size_in_dw);
+
+       for (item = pool->item_list; item; item = item->next) {
+               if (item->start_in_dw > -1) {
+                       if (item->start_in_dw-last_end > size_in_dw) {
+                               return last_end;
+                       }
+
+                       last_end = item->start_in_dw + item->size_in_dw;
+                       /* Move on to the next multiple of 1024 dwords; an end
+                        * that is already a multiple is still advanced by a
+                        * full 1024. */
+                       last_end += (1024 - last_end % 1024);
+               }
+       }
+
+       if (pool->size_in_dw - last_end < size_in_dw) {
+               return -1;
+       }
+
+       return last_end;
+}
+
+/**
+ * Finds the list item after which a chunk starting at start_in_dw has to be
+ * linked: either the item whose successor starts beyond start_in_dw, or the
+ * tail of the list.  The list must not be empty.
+ */
+struct compute_memory_item* compute_memory_postalloc_chunk(
+       struct compute_memory_pool* pool,
+       int64_t start_in_dw)
+{
+       struct compute_memory_item* cur = pool->item_list;
+
+       while (cur) {
+               struct compute_memory_item* following = cur->next;
+
+               if (!following) {
+                       /* Reached the tail: the new chunk goes last. */
+                       assert(cur->start_in_dw < start_in_dw);
+                       return cur;
+               }
+
+               if (cur->start_in_dw < start_in_dw
+                       && following->start_in_dw > start_in_dw) {
+                       return cur;
+               }
+
+               cur = following;
+       }
+
+       assert(0 && "unreachable");
+       return NULL;
+}
+
+/**
+ * Reallocates the pool with a bigger size while conserving its data.
+ *
+ * The device contents are first copied into the host shadow, the shadow is
+ * resized, the old pool buffer is destroyed, a new one of the new size is
+ * allocated and the shadow is copied back into it.
+ *
+ * \param pool            pool to grow
+ * \param pipe            context used for the host<->device copies
+ * \param new_size_in_dw  requested size in dwords; must not be smaller than
+ *                        the current size
+ */
+void compute_memory_grow_pool(struct compute_memory_pool* pool,
+       struct pipe_context * pipe, int new_size_in_dw)
+{
+       assert(new_size_in_dw >= pool->size_in_dw);
+
+       /* Advance to the next multiple of 1024 dwords (a size that is already
+        * a multiple still grows by a full 1024). */
+       new_size_in_dw += 1024 - (new_size_in_dw % 1024);
+
+       /* Save the device contents before the buffer is destroyed. */
+       compute_memory_shadow(pool, pipe, 1);
+       /* NOTE(review): the realloc() result is not checked and overwrites
+        * the only pointer to the old shadow. */
+       pool->shadow = (uint32_t*)realloc(pool->shadow, new_size_in_dw*4);
+       pool->size_in_dw = new_size_in_dw;
+       pool->screen->screen.resource_destroy(
+               (struct pipe_screen *)pool->screen,
+               (struct pipe_resource *)pool->bo);
+       pool->bo = r600_compute_buffer_alloc_vram(pool->screen,
+                                               pool->size_in_dw*4);
+       /* Upload the saved contents into the new buffer. */
+       compute_memory_shadow(pool, pipe, 0);
+}
+
+/**
+ * Copies the whole pool between the device buffer and the host shadow.
+ * A non-zero device_to_host copies device -> shadow, zero copies
+ * shadow -> device.
+ */
+void compute_memory_shadow(struct compute_memory_pool* pool,
+       struct pipe_context * pipe, int device_to_host)
+{
+       /* Temporary item that spans the entire pool. */
+       struct compute_memory_item whole_pool = {
+               .id = 0,
+               .start_in_dw = 0,
+               .size_in_dw = pool->size_in_dw,
+               .prev = NULL,
+               .next = NULL
+       };
+
+       compute_memory_transfer(pool, pipe, device_to_host, &whole_pool,
+                               pool->shadow, 0, pool->size_in_dw*4);
+}
+
+/**
+ * Allocates pending allocations in the pool.
+ *
+ * Pending items (start_in_dw == -1) are first moved from the pool's item
+ * list onto a private list.  The pool is grown if the summed sizes exceed
+ * its current size.  Each pending item is then given a start offset by
+ * compute_memory_prealloc_chunk(), growing the pool again for as long as
+ * that fails, and is linked back into the pool's item list at the position
+ * returned by compute_memory_postalloc_chunk().
+ *
+ * \param pool  pool whose pending items are placed
+ * \param pipe  context handed to compute_memory_grow_pool()
+ */
+void compute_memory_finalize_pending(struct compute_memory_pool* pool,
+       struct pipe_context * pipe)
+{
+       struct compute_memory_item *pending_list = NULL, *end_p = NULL;
+       struct compute_memory_item *item, *next;
+
+       int64_t allocated = 0;
+       int64_t unallocated = 0;
+
+       /* Debug dump of the current list. */
+       /* NOTE(review): "%i" is given an int64_t argument here. */
+       for (item = pool->item_list; item; item = item->next) {
+               COMPUTE_DBG("list: %i %p\n", item->start_in_dw, item->next);
+       }
+
+       /* Split the list: pending items are unlinked from pool->item_list
+        * and appended to pending_list, placed items stay where they are. */
+       for (item = pool->item_list; item; item = next) {
+               next = item->next;
+
+
+               if (item->start_in_dw == -1) {
+                       /* Append to the tail of the pending list. */
+                       if (end_p) {
+                               end_p->next = item;
+                       }
+                       else {
+                               pending_list = item;
+                       }
+
+                       /* Unlink from the pool's item list. */
+                       if (item->prev) {
+                               item->prev->next = next;
+                       }
+                       else {
+                               pool->item_list = next;
+                       }
+
+                       if (next) {
+                               next->prev = item->prev;
+                       }
+
+                       item->prev = end_p;
+                       item->next = NULL;
+                       end_p = item;
+
+                       /* Each pending item is budgeted 1024 extra dwords. */
+                       unallocated += item->size_in_dw+1024;
+               }
+               else {
+                       allocated += item->size_in_dw;
+               }
+       }
+
+       if (pool->size_in_dw < allocated+unallocated) {
+               compute_memory_grow_pool(pool, pipe, allocated+unallocated);
+       }
+
+       /* Place the pending items one by one. */
+       for (item = pending_list; item; item = next) {
+               next = item->next;
+
+               int64_t start_in_dw;
+
+               /* Grow the pool until a free range for the item is found. */
+               while ((start_in_dw=compute_memory_prealloc_chunk(pool,
+                                               item->size_in_dw)) == -1) {
+                       int64_t need = item->size_in_dw+2048 -
+                                               (pool->size_in_dw - allocated);
+
+                       need += 1024 - (need % 1024);
+
+                       if (need > 0) {
+                               compute_memory_grow_pool(pool,
+                                               pipe,
+                                               pool->size_in_dw + need);
+                       }
+                       else {
+                               /* Fall back to growing by a tenth of the
+                                * current pool size. */
+                               need = pool->size_in_dw / 10;
+                               need += 1024 - (need % 1024);
+                               compute_memory_grow_pool(pool,
+                                               pipe,
+                                               pool->size_in_dw + need);
+                       }
+               }
+
+               item->start_in_dw = start_in_dw;
+               item->next = NULL;
+               item->prev = NULL;
+
+               /* Link the item back into the pool's list. */
+               if (pool->item_list) {
+                       struct compute_memory_item *pos;
+
+                       pos = compute_memory_postalloc_chunk(pool, start_in_dw);
+                       item->prev = pos;
+                       item->next = pos->next;
+                       pos->next = item;
+
+                       if (item->next) {
+                               item->next->prev = item;
+                       }
+               }
+               else {
+                       pool->item_list = item;
+               }
+
+               allocated += item->size_in_dw;
+       }
+}
+
+
+/**
+ * Removes the item with the given id from the pool's item list and frees it.
+ *
+ * Prints an error and asserts if no item with that id is in the list.
+ *
+ * \param pool  pool the item belongs to
+ * \param id    id of the item to free
+ */
+void compute_memory_free(struct compute_memory_pool* pool, int64_t id)
+{
+       struct compute_memory_item *item;
+
+       for (item = pool->item_list; item; item = item->next) {
+               if (item->id != id) {
+                       continue;
+               }
+
+               if (item->prev) {
+                       item->prev->next = item->next;
+               }
+               else {
+                       pool->item_list = item->next;
+               }
+
+               if (item->next) {
+                       item->next->prev = item->prev;
+               }
+
+               free(item);
+
+               return;
+       }
+
+       /* id is an int64_t: print it as long long so that the format matches
+        * the argument on every platform. */
+       fprintf(stderr, "Internal error, invalid id %lld "
+               "for compute_memory_free\n", (long long)id);
+
+       assert(0 && "error");
+}
+
+/**
+ * Creates a pending allocation of size_in_dw dwords.
+ *
+ * The new item gets the pool's next id, is marked pending
+ * (start_in_dw == -1) and is appended to the end of the pool's item list.
+ *
+ * \param pool        pool the item is added to
+ * \param size_in_dw  size of the allocation in dwords
+ * \return the new item, or NULL if it could not be allocated
+ */
+struct compute_memory_item* compute_memory_alloc(
+       struct compute_memory_pool* pool,
+       int64_t size_in_dw)
+{
+       struct compute_memory_item *new_item;
+       struct compute_memory_item *last_item;
+
+       /* size_in_dw is an int64_t: print it as long long so that the format
+        * matches the argument. */
+       COMPUTE_DBG("Alloc: %lld\n", (long long)size_in_dw);
+
+       new_item = (struct compute_memory_item *)
+                               CALLOC(sizeof(struct compute_memory_item), 1);
+       if (!new_item) {
+               return NULL;
+       }
+
+       new_item->size_in_dw = size_in_dw;
+       new_item->start_in_dw = -1; /* mark pending */
+       new_item->id = pool->next_id++;
+       new_item->pool = pool;
+
+       if (pool->item_list) {
+               /* Walk to the tail and append. */
+               for (last_item = pool->item_list; last_item->next;
+                                               last_item = last_item->next);
+
+               last_item->next = new_item;
+               new_item->prev = last_item;
+       }
+       else {
+               pool->item_list = new_item;
+       }
+
+       return new_item;
+}
+
+/**
+ * Transfers data between host memory and the pool buffer.
+ *
+ * \param pool             pool whose buffer is accessed
+ * \param pipe             context used to create and map the transfer
+ * \param device_to_host   non-zero: pool buffer -> data,
+ *                         zero: data -> pool buffer
+ * \param chunk            item whose start offset locates the data in the
+ *                         pool
+ * \param data             host memory to copy to or from
+ * \param offset_in_chunk  offset inside the chunk, in bytes
+ * \param size             number of bytes to copy
+ */
+void compute_memory_transfer(
+       struct compute_memory_pool* pool,
+       struct pipe_context * pipe,
+       int device_to_host,
+       struct compute_memory_item* chunk,
+       void* data,
+       int offset_in_chunk,
+       int size)
+{
+       /* The pool buffer is allocated with size_in_dw*4 bytes, so the box
+        * that covers it has to be given in bytes, not in dwords. */
+       int64_t pool_size_in_bytes = pool->size_in_dw * 4;
+       struct pipe_resource* gart = (struct pipe_resource*)pool->bo;
+       int64_t internal_offset = chunk->start_in_dw*4 + offset_in_chunk;
+
+       struct pipe_transfer *xfer;
+       /* Byte pointer: internal_offset is a byte offset and must not be
+        * scaled by the element size of the pointer it is added to. */
+       uint8_t *map;
+
+       xfer = pipe->get_transfer(pipe, gart, 0,
+               device_to_host ? PIPE_TRANSFER_READ : PIPE_TRANSFER_WRITE,
+               &(struct pipe_box) { .width = pool_size_in_bytes,
+               .height = 1, .depth = 1 });
+       assert(xfer);
+       map = pipe->transfer_map(pipe, xfer);
+       assert(map);
+
+       if (device_to_host) {
+               memcpy(data, map + internal_offset, size);
+       } else {
+               memcpy(map + internal_offset, data, size);
+       }
+
+       pipe->transfer_unmap(pipe, xfer);
+       pipe->transfer_destroy(pipe, xfer);
+}
+
+/**
+ * Transfer data between chunk<->data, it is for VRAM<->GART transfers.
+ *
+ * Not implemented yet: the body is empty, so calling this function has no
+ * effect.
+ */
+void compute_memory_transfer_direct(
+       struct compute_memory_pool* pool,
+       int chunk_to_data,
+       struct compute_memory_item* chunk,
+       struct r600_resource* data,
+       int offset_in_chunk,
+       int offset_in_data,
+       int size)
+{
+       ///TODO: DMA
+}
diff --git a/src/gallium/drivers/r600/compute_memory_pool.h b/src/gallium/drivers/r600/compute_memory_pool.h
new file mode 100644 (file)
index 0000000..a14eba1
--- /dev/null
@@ -0,0 +1,98 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Adam Rak <adam.rak@streamnovation.com>
+ */
+
+#ifndef COMPUTE_MEMORY_POOL
+#define COMPUTE_MEMORY_POOL
+
+#include <stdlib.h>
+
+struct compute_memory_pool;
+
+/**
+ * One memory chunk of a compute_memory_pool.  Items are kept in a doubly
+ * linked list (prev/next).
+ */
+struct compute_memory_item
+{
+       int64_t id; ///< ID of the memory chunk
+
+       int untouched; ///< True if the memory contains only junk, no need to save it for defrag
+
+       int64_t start_in_dw; ///< Start pointer in dwords relative in the pool bo
+       int64_t size_in_dw; ///< Size of the chunk in dwords
+
+       struct compute_memory_pool* pool; ///< Pool this chunk was allocated from
+
+       struct compute_memory_item* prev; ///< Previous item in the list, or NULL
+       struct compute_memory_item* next; ///< Next item in the list, or NULL
+};
+
+/**
+ * A pool of memory chunks backed by a single buffer object, with a host
+ * side shadow copy of its contents.
+ */
+struct compute_memory_pool
+{
+       int64_t next_id; ///< For generating unique IDs for memory chunks
+       int64_t size_in_dw; ///< Size of the pool in dwords
+
+       struct r600_resource *bo; ///< The pool buffer object resource
+       struct compute_memory_item* item_list; ///< Allocated memory chunks in the buffer, they must be ordered by "start_in_dw"
+       struct r600_screen *screen; ///< Screen the pool buffer object is allocated from
+
+       uint32_t *shadow; ///< Host copy of the pool, used for defragmentation
+};
+
+
+struct compute_memory_pool* compute_memory_pool_new(int64_t initial_size_in_dw, struct r600_screen *rscreen); ///Creates a new pool
+void compute_memory_pool_delete(struct compute_memory_pool* pool); ///Frees all stuff in the pool and the pool struct itself too
+
+int64_t compute_memory_prealloc_chunk(struct compute_memory_pool* pool, int64_t size_in_dw); ///searches for an empty space in the pool, return with the pointer to the allocatable space in the pool, returns -1 on failure
+
+struct compute_memory_item* compute_memory_postalloc_chunk(struct compute_memory_pool* pool, int64_t start_in_dw); ///search for the chunk where we can link our new chunk after it
+
+/** 
+ * reallocates pool, conserves data
+ */
+void compute_memory_grow_pool(struct compute_memory_pool* pool, struct pipe_context * pipe,
+       int new_size_in_dw);
+
+/**
+ * Copy pool from device to host, or host to device
+ */
+void compute_memory_shadow(struct compute_memory_pool* pool,
+       struct pipe_context * pipe, int device_to_host);
+
+/**
+ * Allocates pending allocations in the pool
+ */
+void compute_memory_finalize_pending(struct compute_memory_pool* pool,
+       struct pipe_context * pipe);
+void compute_memory_defrag(struct compute_memory_pool* pool); ///Defragment the memory pool, always heavy memory usage
+void compute_memory_free(struct compute_memory_pool* pool, int64_t id);
+struct compute_memory_item* compute_memory_alloc(struct compute_memory_pool* pool, int64_t size_in_dw); ///Creates pending allocations
+
+/**
+ * Transfer data host<->device, offset and size is in bytes
+ */
+void compute_memory_transfer(struct compute_memory_pool* pool,
+       struct pipe_context * pipe, int device_to_host,
+       struct compute_memory_item* chunk, void* data,
+       int offset_in_chunk, int size);
+
+void compute_memory_transfer_direct(struct compute_memory_pool* pool, int chunk_to_data, struct compute_memory_item* chunk, struct r600_resource* data, int offset_in_chunk, int offset_in_data, int size); ///Transfer data between chunk<->data, it is for VRAM<->GART transfers
+
+#endif
diff --git a/src/gallium/drivers/r600/compute_resource.def b/src/gallium/drivers/r600/compute_resource.def
new file mode 100644 (file)
index 0000000..161f506
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Adam Rak <adam.rak@streamnovation.com>
+ */
+
+
+/*
+ * List of compute resource kinds, one DECL_COMPUTE_RESOURCE(name, n) entry
+ * per kind.  This file does not define the macro itself.
+ * NOTE(review): presumably the including file defines DECL_COMPUTE_RESOURCE
+ * before including this list, and n is the number of slots of that kind
+ * (e.g. 12 for RAT) -- confirm against the includer.
+ */
+DECL_COMPUTE_RESOURCE(CONFIG, 1)
+DECL_COMPUTE_RESOURCE(CONST_MEM, 16)
+DECL_COMPUTE_RESOURCE(RAT, 12)
+DECL_COMPUTE_RESOURCE(VERT, 16)
+DECL_COMPUTE_RESOURCE(TEX, 16)
+DECL_COMPUTE_RESOURCE(SAMPLER, 18)
+DECL_COMPUTE_RESOURCE(LOOP, 32)
+DECL_COMPUTE_RESOURCE(LDS, 1)
+DECL_COMPUTE_RESOURCE(GDS, 1)
+DECL_COMPUTE_RESOURCE(EXPORT, 1)
+DECL_COMPUTE_RESOURCE(SHADER, 1)
+DECL_COMPUTE_RESOURCE(TMPRING, 4)
+DECL_COMPUTE_RESOURCE(DISPATCH, 1)
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
new file mode 100644 (file)
index 0000000..7aeb403
--- /dev/null
@@ -0,0 +1,814 @@
+/*
+ * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Adam Rak <adam.rak@streamnovation.com>
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "util/u_blitter.h"
+#include "util/u_double_list.h"
+#include "util/u_transfer.h"
+#include "util/u_surface.h"
+#include "util/u_pack_color.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_framebuffer.h"
+#include "pipebuffer/pb_buffer.h"
+#include "r600.h"
+#include "evergreend.h"
+#include "r600_resource.h"
+#include "r600_shader.h"
+#include "r600_pipe.h"
+#include "r600_formats.h"
+#include "evergreen_compute.h"
+#include "r600_hw_context_priv.h"
+#include "evergreen_compute_internal.h"
+#include "compute_memory_pool.h"
+#ifdef HAVE_OPENCL
+#include "llvm_wrapper.h"
+#endif
+
+/**
+RAT0 is for global binding write
+VTX1 is for global binding read
+
+for writing images RAT1...
+for reading images TEX2...
+  TEX2-RAT1 is paired
+
+TEX2... consumes the same fetch resources, that VTX2... would consume
+
+CONST0 and VTX0 is for parameters
+  CONST0 is binding smaller input parameter buffer, and for constant indexing,
+  also constant cached
+  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
+  the constant cache can handle
+
+RATs are limited to 12, so we can bind at most 11 textures for writing,
+because we reserve RAT0 for global bindings. With byte addressing enabled,
+we should reserve another one too => at most 10 image bindings for writing.
+
+from Nvidia OpenCL:
+  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
+  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8 
+
+so 10 for writing is enough. 176 is the max for reading according to the docs
+
+writable images should be listed first < 10, so their id corresponds to RAT(id+1)
+writable images will consume TEX slots, VTX slots too because of linear indexing
+
+*/
+
+/**
+ * Resource function table for global buffers: every slot except get_handle
+ * points at the matching r600_compute_global_* implementation.
+ */
+const struct u_resource_vtbl r600_global_buffer_vtbl =
+{
+       u_default_resource_get_handle, /* get_handle */
+       r600_compute_global_buffer_destroy, /* resource_destroy */
+       r600_compute_global_get_transfer, /* get_transfer */
+       r600_compute_global_transfer_destroy, /* transfer_destroy */
+       r600_compute_global_transfer_map, /* transfer_map */
+       r600_compute_global_transfer_flush_region,/* transfer_flush_region */
+       r600_compute_global_transfer_unmap, /* transfer_unmap */
+       r600_compute_global_transfer_inline_write /* transfer_inline_write */
+};
+
+
+/**
+ * Creates the driver's compute state object for a pipe_compute_state.
+ *
+ * Allocates the r600_pipe_compute struct and its resource array, and copies
+ * the requested local, private and input memory sizes from cso.  When built
+ * with HAVE_OPENCL the program that follows the pipe_llvm_program_header in
+ * cso->prog is parsed as LLVM bitcode and handed to
+ * r600_compute_shader_create().
+ *
+ * \param ctx_  pipe context
+ * \param cso   compute state description
+ * \return the new state object, or NULL if compute is not supported or an
+ *         allocation failed
+ */
+void *evergreen_create_compute_state(
+       struct pipe_context *ctx_,
+       const struct pipe_compute_state *cso)
+{
+       struct r600_context *ctx = (struct r600_context *)ctx_;
+       struct r600_pipe_compute *shader;
+
+#ifdef HAVE_OPENCL
+       const struct pipe_llvm_program_header * header;
+       const unsigned char * code;
+
+       header = cso->prog;
+       /* Step over the header with a byte pointer; arithmetic directly on a
+        * void pointer is a compiler extension, not standard C. */
+       code = (const unsigned char *)cso->prog +
+                               sizeof(struct pipe_llvm_program_header);
+#endif
+
+       if (!ctx->screen->screen.get_param(&ctx->screen->screen,
+                                                       PIPE_CAP_COMPUTE)) {
+               fprintf(stderr, "Compute is not supported\n");
+               return NULL;
+       }
+
+       shader = CALLOC_STRUCT(r600_pipe_compute);
+       if (!shader) {
+               return NULL;
+       }
+
+       shader->ctx = (struct r600_context*)ctx;
+       shader->resources = (struct evergreen_compute_resource*)
+                       CALLOC(sizeof(struct evergreen_compute_resource),
+                       get_compute_resource_num());
+       if (!shader->resources) {
+               FREE(shader);
+               return NULL;
+       }
+
+       shader->local_size = cso->req_local_mem; ///TODO: assert it
+       shader->private_size = cso->req_private_mem;
+       shader->input_size = cso->req_input_mem;
+
+#ifdef HAVE_OPENCL
+       shader->mod = llvm_parse_bitcode(code, header->num_bytes);
+
+       r600_compute_shader_create(ctx_, shader->mod, &shader->bc);
+#endif
+       return shader;
+}
+
+/**
+ * Destroys a compute state object created by
+ * evergreen_create_compute_state().  A NULL state is ignored.
+ *
+ * NOTE(review): only the resource array and the struct itself are released
+ * here; the shader_code_bo and kernel_param buffers that other functions in
+ * this file attach to the state are not -- confirm whether they are released
+ * elsewhere.
+ */
+void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
+{
+       struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
+
+       if (!shader) {
+               return;
+       }
+
+       /* Both allocations come from CALLOC/CALLOC_STRUCT, so they are
+        * released with the matching FREE. */
+       FREE(shader->resources);
+       FREE(shader);
+}
+
+/**
+ * Makes state the current compute shader of the context.
+ *
+ * Allocates a buffer for the shader bytecode and copies the bytecode into
+ * it, initializes the compute config, fills the SHADER resource with the
+ * program register values and sets up loop constant 0.
+ *
+ * NOTE(review): state is dereferenced unconditionally and the assert below
+ * requires that it has no code buffer yet, so as written a state can only
+ * be bound once and must not be NULL -- confirm against callers.
+ */
+static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
+{
+       struct r600_context *ctx = (struct r600_context *)ctx_;
+
+       ctx->cs_shader = (struct r600_pipe_compute *)state;
+
+       assert(!ctx->cs_shader->shader_code_bo);
+
+       /* Buffer that holds the bytecode: bc.ndw dwords. */
+       ctx->cs_shader->shader_code_bo =
+               r600_compute_buffer_alloc_vram(ctx->screen,
+                                       ctx->cs_shader->bc.ndw * 4);
+
+       /* Map the buffer, copy the bytecode in and unmap it again. */
+       void *p = ctx->ws->buffer_map(ctx->cs_shader->shader_code_bo->cs_buf,
+                                       ctx->cs, PIPE_TRANSFER_WRITE);
+
+       memcpy(p, ctx->cs_shader->bc.bytecode, ctx->cs_shader->bc.ndw * 4);
+
+       ctx->ws->buffer_unmap(ctx->cs_shader->shader_code_bo->cs_buf);
+
+       evergreen_compute_init_config(ctx);
+
+       struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader,
+                                               COMPUTE_RESOURCE_SHADER, 0);
+
+       evergreen_reg_set(res, R_008C0C_SQ_GPR_RESOURCE_MGMT_3,
+                       S_008C0C_NUM_LS_GPRS(ctx->cs_shader->bc.ngpr));
+
+       ///maybe we can use it later
+       evergreen_reg_set(res, R_0286C8_SPI_THREAD_GROUPING, 0);
+       ///maybe we can use it later
+       evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0);
+
+       evergreen_reg_set(res, R_0288D4_SQ_PGM_RESOURCES_LS,
+               S_0288D4_NUM_GPRS(ctx->cs_shader->bc.ngpr)
+               | S_0288D4_STACK_SIZE(ctx->cs_shader->bc.nstack));
+       evergreen_reg_set(res, R_0288D8_SQ_PGM_RESOURCES_LS_2, 0);
+
+       evergreen_reg_set(res, R_0288D0_SQ_PGM_START_LS, 0);
+       /* Attach the code buffer to the resource as a read-only buffer. */
+       res->bo = ctx->cs_shader->shader_code_bo;
+       res->usage = RADEON_USAGE_READ;
+       res->coher_bo_size = ctx->cs_shader->bc.ndw*4;
+       res->flags = COMPUTE_RES_SH_FLUSH;
+
+       /* We can't always determine the
+        * number of iterations in a loop before it's executed,
+        * so we just need to set up the loop counter to give us the maximum
+        * number of iterations possible.  Currently, loops in shader code
+        * ignore the loop counter and use a break instruction to exit the
+        * loop at the correct time.
+        */
+       evergreen_set_loop_const(ctx->cs_shader,
+               0, /* index */
+               0xFFF, /* Maximum value of the loop counter (i.e. when the loop
+                       * counter reaches this value, the program will break
+                       * out of the loop. */
+               0x0,   /* Starting value of the loop counter. */
+               0x1);  /* Amount to increment the loop counter each iteration. */
+}
+
+/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
+ * explicit kernel parameters there are implicit parameters that need to be
+ * stored in the vertex buffer as well.  Here is how these parameters are
+ * organized in the buffer:
+ *
+ * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
+ * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
+ * DWORDS 6-8: Number of work items within each work group in each dimension
+ *             (x,y,z)
+ * DWORDS 9+ : Kernel parameters
+ *
+ * grid_layout supplies DWORDS 0-2 and block_layout DWORDS 6-8; DWORDS 3-5 are
+ * their per-dimension product.  input must point to at least
+ * cs_shader->input_size bytes.  Nothing is uploaded when input_size is 0, or
+ * when the parameter buffer cannot be allocated or mapped.
+ */
+void evergreen_compute_upload_input(
+       struct pipe_context *ctx_,
+       const uint *block_layout,
+       const uint *grid_layout,
+       const void *input)
+{
+       struct r600_context *ctx = (struct r600_context *)ctx_;
+       int i;
+       /* Size of the nine implicit dwords that precede the kernel inputs. */
+       unsigned kernel_parameters_offset_bytes = 36;
+       uint32_t * num_work_groups_start;
+       uint32_t * global_size_start;
+       uint32_t * local_size_start;
+       uint32_t * kernel_parameters_start;
+
+       if (ctx->cs_shader->input_size == 0) {
+               return;
+       }
+
+       /* The parameter buffer is created on first use and then reused. */
+       if (!ctx->cs_shader->kernel_param) {
+               unsigned buffer_size = ctx->cs_shader->input_size;
+
+               /* Add space for the grid dimensions.
+                * NOTE(review): the offset is already expressed in bytes, so
+                * scaling it by sizeof(uint) reserves more than the 36 bytes
+                * that are written below; kept as-is (over-allocation only). */
+               buffer_size += kernel_parameters_offset_bytes * sizeof(uint);
+               ctx->cs_shader->kernel_param =
+                               r600_compute_buffer_alloc_vram(ctx->screen,
+                                               buffer_size);
+               if (!ctx->cs_shader->kernel_param) {
+                       COMPUTE_DBG("failed to allocate the kernel parameter buffer\n");
+                       return;
+               }
+       }
+
+       num_work_groups_start = ctx->ws->buffer_map(
+                       ctx->cs_shader->kernel_param->cs_buf,
+                       ctx->cs, PIPE_TRANSFER_WRITE);
+       if (!num_work_groups_start) {
+               /* Without a CPU mapping there is nowhere to write to. */
+               COMPUTE_DBG("failed to map the kernel parameter buffer\n");
+               return;
+       }
+
+       global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
+       local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
+       kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
+
+       /* Copy the number of work groups */
+       memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
+
+       /* Compute the global size: work groups times work items per group */
+       for (i = 0; i < 3; i++) {
+               global_size_start[i] = grid_layout[i] * block_layout[i];
+       }
+
+       /* Copy the local dimensions */
+       memcpy(local_size_start, block_layout, 3 * sizeof(uint));
+
+       /* Copy the kernel inputs */
+       memcpy(kernel_parameters_start, input, ctx->cs_shader->input_size);
+
+       /* Dump every dword that was just written (implicit + explicit). */
+       for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
+                                       (ctx->cs_shader->input_size / 4); i++) {
+               COMPUTE_DBG("input %i : %i\n", i,
+                       ((unsigned*)num_work_groups_start)[i]);
+       }
+
+       ctx->ws->buffer_unmap(ctx->cs_shader->kernel_param->cs_buf);
+
+       ///ID=0 is reserved for the parameters
+       evergreen_set_vtx_resource(ctx->cs_shader,
+               ctx->cs_shader->kernel_param, 0, 0, 0);
+       ///ID=0 is reserved for parameters
+       evergreen_set_const_cache(ctx->cs_shader, 0,
+               ctx->cs_shader->kernel_param, ctx->cs_shader->input_size, 0);
+}
+
+/* Queue the register writes and the DISPATCH_DIRECT packet for one launch.
+ *
+ * block_layout holds the number of work items per work group and grid_layout
+ * the number of work groups, three entries (x,y,z) each.  Everything is
+ * recorded in the COMPUTE_RESOURCE_DISPATCH resource of the bound compute
+ * shader; nothing is submitted here.
+ */
+void evergreen_direct_dispatch(
+               struct pipe_context *ctx_,
+               const uint *block_layout, const uint *grid_layout)
+{
+       struct r600_context *ctx = (struct r600_context *)ctx_;
+       /* Total number of work items in one work group. */
+       unsigned group_size = 1;
+       int i;
+
+       struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader,
+               COMPUTE_RESOURCE_DISPATCH, 0);
+
+       evergreen_reg_set(res, R_008958_VGT_PRIMITIVE_TYPE, V_008958_DI_PT_POINTLIST);
+
+       evergreen_reg_set(res, R_00899C_VGT_COMPUTE_START_X, 0);
+       evergreen_reg_set(res, R_0089A0_VGT_COMPUTE_START_Y, 0);
+       evergreen_reg_set(res, R_0089A4_VGT_COMPUTE_START_Z, 0);
+
+       evergreen_reg_set(res, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, block_layout[0]);
+       evergreen_reg_set(res, R_0286F0_SPI_COMPUTE_NUM_THREAD_Y, block_layout[1]);
+       evergreen_reg_set(res, R_0286F4_SPI_COMPUTE_NUM_THREAD_Z, block_layout[2]);
+
+       for (i = 0; i < 3; i++) {
+               group_size *= block_layout[i];
+       }
+
+       evergreen_reg_set(res, R_008970_VGT_NUM_INDICES, group_size);
+       evergreen_reg_set(res, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size);
+
+       /* Packet header followed by the grid dimensions and the initiator. */
+       evergreen_emit_raw_value(res, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
+       evergreen_emit_raw_value(res, grid_layout[0]);
+       evergreen_emit_raw_value(res, grid_layout[1]);
+       evergreen_emit_raw_value(res, grid_layout[2]);
+       ///VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN
+       evergreen_emit_raw_value(res, 1);
+}
+
+/* Copy the command words recorded in every enabled compute resource into the
+ * command stream, emit the relocations and buffer syncs for their buffers,
+ * flush the stream and wait for one of the referenced buffers to go idle.
+ */
+static void compute_emit_cs(struct r600_context *ctx)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+       int i;
+
+       r600_emit_atom(ctx, &ctx->start_cs_cmd.atom);
+
+       /* Last buffer seen while emitting; used below to wait for completion. */
+       struct r600_resource *onebo = NULL;
+
+       for (i = 0; i < get_compute_resource_num(); i++) {
+               if (ctx->cs_shader->resources[i].enabled) {
+                       int j;
+                       COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);
+
+                       for (j = 0; j < ctx->cs_shader->resources[i].cs_end; j++) {
+                               /* A relocation is emitted ahead of each
+                                * flagged command word. */
+                               if (ctx->cs_shader->resources[i].do_reloc[j]) {
+                                       assert(ctx->cs_shader->resources[i].bo);
+                                       evergreen_emit_ctx_reloc(ctx,
+                                               ctx->cs_shader->resources[i].bo,
+                                               ctx->cs_shader->resources[i].usage);
+                               }
+
+                               cs->buf[cs->cdw++] = ctx->cs_shader->resources[i].cs[j];
+                       }
+
+                       if (ctx->cs_shader->resources[i].bo) {
+                               onebo = ctx->cs_shader->resources[i].bo;
+                               evergreen_emit_ctx_reloc(ctx,
+                                       ctx->cs_shader->resources[i].bo,
+                                       ctx->cs_shader->resources[i].usage);
+
+                               ///special case for textures
+                               /* NOTE(review): this reads do_reloc at index
+                                * cs_end, one past the words copied above;
+                                * confirm the array is sized for that. */
+                               if (ctx->cs_shader->resources[i].do_reloc
+                                       [ctx->cs_shader->resources[i].cs_end] == 2) {
+                                       evergreen_emit_ctx_reloc(ctx,
+                                               ctx->cs_shader->resources[i].bo,
+                                               ctx->cs_shader->resources[i].usage);
+                               }
+
+                               evergreen_set_buffer_sync(ctx, ctx->cs_shader->resources[i].bo,
+                                       ctx->cs_shader->resources[i].coher_bo_size,
+                                       ctx->cs_shader->resources[i].flags,
+                                       ctx->cs_shader->resources[i].usage);
+                       }
+               }
+       }
+
+#if 0
+       COMPUTE_DBG("cdw: %i\n", cs->cdw);
+       for (i = 0; i < cs->cdw; i++) {
+               COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
+       }
+#endif
+
+       ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC);
+
+       ctx->pm4_dirty_cdwords = 0;
+       ctx->flags = 0;
+
+       COMPUTE_DBG("shader started\n");
+
+       /* onebo stays NULL when no enabled resource carries a buffer; there
+        * is nothing to wait on in that case. */
+       if (onebo) {
+               ctx->ws->buffer_wait(onebo->buf, 0);
+       }
+
+       COMPUTE_DBG("...\n");
+
+       /* Re-emit the start-of-CS state into the fresh command stream. */
+       r600_emit_atom(ctx, &ctx->start_cs_cmd.atom);
+
+       ctx->streamout_start = TRUE;
+       ctx->streamout_append_bitmask = ~0;
+
+}
+
+/* launch_grid hook: set up LDS, upload the kernel inputs, record the dispatch
+ * and submit the command stream.  block_layout is the work-group size and
+ * grid_layout the number of work groups, three entries each.  pc is only
+ * logged.
+ */
+static void evergreen_launch_grid(
+               struct pipe_context *ctx_,
+               const uint *block_layout, const uint *grid_layout,
+               uint32_t pc, const void *input)
+{
+       COMPUTE_DBG("PC: %i\n", pc);
+
+       struct r600_context *ctx = (struct r600_context *)ctx_;
+       unsigned num_pipes = ctx->screen->info.r600_max_pipes;
+       unsigned threads_per_wave = 16 * num_pipes;
+       unsigned threads_per_group =
+               block_layout[0] * block_layout[1] * block_layout[2];
+
+       /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
+       unsigned num_waves =
+               (threads_per_group + threads_per_wave - 1) / threads_per_wave;
+
+       COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
+                                                       num_pipes, num_waves);
+
+       evergreen_set_lds(ctx->cs_shader, 0, 0, num_waves);
+       evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
+       evergreen_direct_dispatch(ctx_, block_layout, grid_layout);
+       compute_emit_cs(ctx);
+}
+
+/* set_compute_resources hook: bind each non-NULL surface as vertex-fetch
+ * resource i+2 and, when the surface is writable, additionally as RAT i+1.
+ *
+ * NOTE(review): 'start' is not used; slots are always numbered from the
+ * position inside 'surfaces'.
+ */
+static void evergreen_set_compute_resources(struct pipe_context * ctx_,
+               unsigned start, unsigned count,
+               struct pipe_surface ** surfaces)
+{
+       struct r600_context *ctx = (struct r600_context *)ctx_;
+       struct r600_surface **resources = (struct r600_surface **)surfaces;
+
+       for (unsigned i = 0; i < count; i++) {
+               struct r600_resource_global *buffer;
+
+               if (!resources[i]) {
+                       continue;
+               }
+
+               /* The backing chunk provides the byte offset of this buffer. */
+               buffer = (struct r600_resource_global*)resources[i]->base.texture;
+
+               if (resources[i]->base.writable) {
+                       assert(i+1 < 12);
+
+                       evergreen_set_rat(ctx->cs_shader, i+1,
+                               (struct r600_resource *)resources[i]->base.texture,
+                               buffer->chunk->start_in_dw*4,
+                               resources[i]->base.texture->width0);
+               }
+
+               evergreen_set_vtx_resource(ctx->cs_shader,
+                       (struct r600_resource *)resources[i]->base.texture, i+2,
+                        buffer->chunk->start_in_dw*4, resources[i]->base.writable);
+       }
+
+}
+
+/* set_compute_sampler_views hook: bind every non-NULL view as texture
+ * resource i+2, where i is the position of the view inside 'views'.
+ */
+static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
+               unsigned start_slot, unsigned count,
+               struct pipe_sampler_view **views)
+{
+       struct r600_context *ctx = (struct r600_context *)ctx_;
+       struct r600_pipe_sampler_view **resource =
+               (struct r600_pipe_sampler_view **)views;
+       int i;
+
+       for (i = 0; i < count; i++) {
+               if (!resource[i]) {
+                       continue;
+               }
+
+               assert(i+1 < 12);
+               ///FETCH0 = VTX0 (param buffer),
+               //FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
+               evergreen_set_tex_resource(ctx->cs_shader, resource[i], i+2);
+       }
+}
+
+/* bind_compute_sampler_states hook: forward every non-NULL sampler state to
+ * evergreen_set_sampler_resource() using its array position as the id.
+ */
+static void evergreen_bind_compute_sampler_states(
+       struct pipe_context *ctx_,
+       unsigned start_slot,
+       unsigned num_samplers,
+       void **samplers_)
+{
+       struct r600_context *ctx = (struct r600_context *)ctx_;
+       struct compute_sampler_state ** samplers =
+               (struct compute_sampler_state **)samplers_;
+       int i;
+
+       for (i = 0; i < num_samplers; i++) {
+               if (!samplers[i]) {
+                       continue;
+               }
+
+               evergreen_set_sampler_resource(ctx->cs_shader, samplers[i], i);
+       }
+}
+
+/* set_global_binding hook: finalize pending pool allocations, write the byte
+ * offset of each buffer inside the global pool through its handle, and bind
+ * the whole pool as RAT 0 and vertex-fetch resource 1.
+ *
+ * A NULL 'resources' (unbind request) is currently a no-op.
+ */
+static void evergreen_set_global_binding(
+       struct pipe_context *ctx_, unsigned first, unsigned n,
+       struct pipe_resource **resources,
+       uint32_t **handles)
+{
+       struct r600_context *ctx = (struct r600_context *)ctx_;
+       struct compute_memory_pool *pool = ctx->screen->global_pool;
+       struct r600_resource_global **buffers =
+               (struct r600_resource_global **)resources;
+       int i;
+
+       if (resources == NULL) {
+               /* XXX: Unset */
+               return;
+       }
+
+       compute_memory_finalize_pending(pool, ctx_);
+
+       for (i = 0; i < n; i++) {
+               uint32_t *handle = handles[i];
+
+               assert(resources[i]->target == PIPE_BUFFER);
+               assert(resources[i]->bind & PIPE_BIND_GLOBAL);
+
+               /* Chunk positions are kept in dwords; handles are in bytes. */
+               *handle = buffers[i]->chunk->start_in_dw * 4;
+       }
+
+       evergreen_set_rat(ctx->cs_shader, 0, pool->bo, 0, pool->size_in_dw * 4);
+       evergreen_set_vtx_resource(ctx->cs_shader, pool->bo, 1, 0, 1);
+}
+
+
+/* Record the base hardware configuration for compute in the
+ * COMPUTE_RESOURCE_CONFIG resource of the bound compute shader.  The stack
+ * size and the VC_ENABLE bit depend on ctx->family; every other value is
+ * fixed.
+ */
+void evergreen_compute_init_config(struct r600_context *ctx)
+{
+       struct evergreen_compute_resource* res =
+               get_empty_res(ctx->cs_shader, COMPUTE_RESOURCE_CONFIG, 0);
+
+       /* Identical for every family handled here. */
+       const int num_temp_gprs = 4;
+       const int num_threads = 128;
+       int num_stack_entries;
+       unsigned sq_config;
+       unsigned tmp;
+
+       switch (ctx->family) {
+       case CHIP_JUNIPER:
+       case CHIP_CYPRESS:
+       case CHIP_HEMLOCK:
+       case CHIP_SUMO2:
+       case CHIP_BARTS:
+               num_stack_entries = 512;
+               break;
+       default:
+               /* CEDAR, REDWOOD, PALM, SUMO, TURKS, CAICOS and every
+                * family not listed above. */
+               num_stack_entries = 256;
+               break;
+       }
+
+       /* VC_ENABLE is left clear on the families listed first. */
+       switch (ctx->family) {
+       case CHIP_CEDAR:
+       case CHIP_PALM:
+       case CHIP_SUMO:
+       case CHIP_SUMO2:
+       case CHIP_CAICOS:
+               sq_config = 0x00000000;
+               break;
+       default:
+               sq_config = S_008C00_VC_ENABLE(1);
+               break;
+       }
+       sq_config |= S_008C00_EXPORT_SRC_C(1)
+               | S_008C00_CS_PRIO(0)
+               | S_008C00_LS_PRIO(0)
+               | S_008C00_HS_PRIO(0)
+               | S_008C00_PS_PRIO(0)
+               | S_008C00_VS_PRIO(0)
+               | S_008C00_GS_PRIO(0)
+               | S_008C00_ES_PRIO(0);
+
+       evergreen_reg_set(res, R_008C00_SQ_CONFIG, sq_config);
+
+       /* GPR management */
+       evergreen_reg_set(res, R_008C04_SQ_GPR_RESOURCE_MGMT_1,
+                               S_008C04_NUM_CLAUSE_TEMP_GPRS(num_temp_gprs));
+       evergreen_reg_set(res, R_008C08_SQ_GPR_RESOURCE_MGMT_2, 0);
+       evergreen_reg_set(res, R_008C10_SQ_GLOBAL_GPR_RESOURCE_MGMT_1, 0);
+       evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0);
+       evergreen_reg_set(res, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
+       /* workaround for hw issues with dyn gpr - must set all limits to 240
+        * instead of 0, 0x1e == 240/8 */
+       tmp = S_028838_PS_GPRS(0x1e) |
+               S_028838_VS_GPRS(0x1e) |
+               S_028838_GS_GPRS(0x1e) |
+               S_028838_ES_GPRS(0x1e) |
+               S_028838_HS_GPRS(0x1e) |
+               S_028838_LS_GPRS(0x1e);
+       evergreen_reg_set(res, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1, tmp);
+
+       /* Thread and stack management */
+       evergreen_reg_set(res, R_008E20_SQ_STATIC_THREAD_MGMT1, 0xFFFFFFFF);
+       evergreen_reg_set(res, R_008E24_SQ_STATIC_THREAD_MGMT2, 0xFFFFFFFF);
+       evergreen_reg_set(res, R_008E28_SQ_STATIC_THREAD_MGMT3, 0xFFFFFFFF);
+       evergreen_reg_set(res, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 0);
+       evergreen_reg_set(res, R_008C1C_SQ_THREAD_RESOURCE_MGMT_2,
+                               S_008C1C_NUM_LS_THREADS(num_threads));
+       evergreen_reg_set(res, R_008C20_SQ_STACK_RESOURCE_MGMT_1, 0);
+       evergreen_reg_set(res, R_008C24_SQ_STACK_RESOURCE_MGMT_2, 0);
+       evergreen_reg_set(res, R_008C28_SQ_STACK_RESOURCE_MGMT_3,
+                               S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
+
+       /* SPI */
+       evergreen_reg_set(res, R_0286CC_SPI_PS_IN_CONTROL_0, S_0286CC_LINEAR_GRADIENT_ENA(1));
+       evergreen_reg_set(res, R_0286D0_SPI_PS_IN_CONTROL_1, 0);
+       evergreen_reg_set(res, R_0286E4_SPI_PS_IN_CONTROL_2, 0);
+       evergreen_reg_set(res, R_0286D8_SPI_INPUT_Z, 0);
+       evergreen_reg_set(res, R_0286E0_SPI_BARYC_CNTL, 1 << 20);
+       evergreen_reg_set(res, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
+                               S_0286E8_TID_IN_GROUP_ENA
+                               | S_0286E8_TGID_ENA
+                               | S_0286E8_DISABLE_INDEX_PACK);
+
+       /* VGT and DB */
+       evergreen_reg_set(res, R_028A40_VGT_GS_MODE,
+                               S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
+       evergreen_reg_set(res, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
+       evergreen_reg_set(res, R_028800_DB_DEPTH_CONTROL, 0);
+       evergreen_reg_set(res, R_02880C_DB_SHADER_CONTROL, 0);
+       evergreen_reg_set(res, R_028000_DB_RENDER_CONTROL, S_028000_COLOR_DISABLE(1));
+       evergreen_reg_set(res, R_02800C_DB_RENDER_OVERRIDE, 0);
+
+       /* Same value as the SPI_COMPUTE_INPUT_CNTL write above; the second
+        * write is preserved so the recorded register sequence is unchanged. */
+       tmp = S_0286E8_TID_IN_GROUP_ENA | S_0286E8_TGID_ENA | S_0286E8_DISABLE_INDEX_PACK;
+       evergreen_reg_set(res, R_0286E8_SPI_COMPUTE_INPUT_CNTL, tmp);
+}
+
+/* Install the compute entry points of this file into the context's function
+ * table.
+ */
+void evergreen_init_compute_state_functions(struct r600_context *ctx)
+{
+       /* Compute state object lifetime */
+       ctx->context.create_compute_state = evergreen_create_compute_state;
+       ctx->context.bind_compute_state = evergreen_bind_compute_state;
+       ctx->context.delete_compute_state = evergreen_delete_compute_state;
+
+       /* Resource, sampler and global buffer bindings.
+        * create_sampler_view is not overridden here; the assignment below
+        * was left disabled. */
+//      ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
+       ctx->context.set_compute_resources = evergreen_set_compute_resources;
+       ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
+       ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
+       ctx->context.set_global_binding = evergreen_set_global_binding;
+
+       /* Kernel launch */
+       ctx->context.launch_grid = evergreen_launch_grid;
+}
+
+
+/* Create a PIPE_BIND_GLOBAL buffer backed by a chunk of the screen's global
+ * memory pool.
+ *
+ * templ describes the buffer; its width0 is rounded up to whole dwords for
+ * the pool allocation.  Returns the new resource with a reference count of
+ * one, or NULL when either the resource or its pool chunk cannot be
+ * allocated.
+ */
+struct pipe_resource *r600_compute_global_buffer_create(
+       struct pipe_screen *screen,
+       const struct pipe_resource *templ)
+{
+       assert(templ->target == PIPE_BUFFER);
+       assert(templ->bind & PIPE_BIND_GLOBAL);
+       assert(templ->array_size == 1 || templ->array_size == 0);
+       assert(templ->depth0 == 1 || templ->depth0 == 0);
+       assert(templ->height0 == 1 || templ->height0 == 0);
+
+       struct r600_resource_global* result = (struct r600_resource_global*)
+               CALLOC(1, sizeof(struct r600_resource_global));
+       struct r600_screen* rscreen = (struct r600_screen*)screen;
+
+       if (result == NULL)
+       {
+               return NULL;
+       }
+
+       result->base.b.vtbl = &r600_global_buffer_vtbl;
+       /* Copy the template first: it overwrites every pipe_resource field,
+        * including the screen pointer that is set right after. */
+       result->base.b.b = *templ;
+       result->base.b.b.screen = screen;
+       pipe_reference_init(&result->base.b.b.reference, 1);
+
+       int size_in_dw = (templ->width0+3) / 4;
+
+       result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
+
+       if (result->chunk == NULL)
+       {
+               /* Release with the counterpart of CALLOC. */
+               FREE(result);
+               return NULL;
+       }
+
+       return &result->base.b.b;
+}
+
+/* Destroy a buffer created by r600_compute_global_buffer_create(): return its
+ * chunk to the screen's global memory pool and release the resource itself.
+ */
+void r600_compute_global_buffer_destroy(
+       struct pipe_screen *screen,
+       struct pipe_resource *res)
+{
+       assert(res->target == PIPE_BUFFER);
+       assert(res->bind & PIPE_BIND_GLOBAL);
+
+       struct r600_resource_global* buffer = (struct r600_resource_global*)res;
+       struct r600_screen* rscreen = (struct r600_screen*)screen;
+
+       compute_memory_free(rscreen->global_pool, buffer->chunk->id);
+
+       buffer->chunk = NULL;
+       /* The resource was obtained with CALLOC, so release it with FREE. */
+       FREE(res);
+}
+
+/* Map the global pool buffer that backs the transfer's resource and return a
+ * CPU pointer to byte box.x of that resource, or NULL when the mapping
+ * fails.
+ */
+void* r600_compute_global_transfer_map(
+       struct pipe_context *ctx_,
+       struct pipe_transfer* transfer)
+{
+       assert(transfer->resource->target == PIPE_BUFFER);
+       assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
+       assert(transfer->box.x >= 0);
+       assert(transfer->box.y == 0);
+       assert(transfer->box.z == 0);
+
+       struct r600_context *ctx = (struct r600_context *)ctx_;
+       struct r600_resource_global* buffer =
+               (struct r600_resource_global*)transfer->resource;
+
+       ///TODO: do it better, mapping is not possible if the pool is too big
+       uint32_t* pool_base = ctx->ws->buffer_map(
+                       buffer->chunk->pool->bo->cs_buf,
+                       ctx->cs, transfer->usage);
+
+       if (pool_base == NULL) {
+               return NULL;
+       }
+
+       COMPUTE_DBG("buffer start: %lli\n", buffer->chunk->start_in_dw);
+
+       /* Step to the chunk in dwords, then to the requested byte. */
+       uint32_t* chunk_base = pool_base + buffer->chunk->start_in_dw;
+       return ((char*)chunk_base) + transfer->box.x;
+}
+
+/* Undo r600_compute_global_transfer_map(): unmap the pool buffer that backs
+ * the transfer's resource.
+ */
+void r600_compute_global_transfer_unmap(
+       struct pipe_context *ctx_,
+       struct pipe_transfer* transfer)
+{
+       assert(transfer->resource->target == PIPE_BUFFER);
+       assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
+
+       struct r600_context *rctx = (struct r600_context *)ctx_;
+       struct r600_resource_global* global_buf =
+               (struct r600_resource_global*)transfer->resource;
+
+       rctx->ws->buffer_unmap(global_buf->chunk->pool->bo->cs_buf);
+}
+
+/* Allocate and fill a transfer object for a global buffer.
+ *
+ * Pending pool allocations are finalized first.  level, usage and box are
+ * copied into the transfer; both strides are zero and no data pointer is set.
+ * Returns NULL when the transfer object cannot be allocated.
+ */
+struct pipe_transfer * r600_compute_global_get_transfer(
+       struct pipe_context *ctx_,
+       struct pipe_resource *resource,
+       unsigned level,
+       unsigned usage,
+       const struct pipe_box *box)
+{
+       struct r600_context *ctx = (struct r600_context *)ctx_;
+       struct compute_memory_pool *pool = ctx->screen->global_pool;
+
+       compute_memory_finalize_pending(pool, ctx_);
+
+       assert(resource->target == PIPE_BUFFER);
+       struct pipe_transfer *transfer = util_slab_alloc(&ctx->pool_transfers);
+
+       if (transfer == NULL) {
+               return NULL;
+       }
+
+       transfer->resource = resource;
+       transfer->level = level;
+       transfer->usage = usage;
+       transfer->box = *box;
+       transfer->stride = 0;
+       transfer->layer_stride = 0;
+       transfer->data = NULL;
+
+       /* Note strides are zero, this is ok for buffers, but not for
+       * textures 2d & higher at least.
+       */
+       return transfer;
+}
+
+/* Return a transfer object to the context's transfer slab. */
+void r600_compute_global_transfer_destroy(
+       struct pipe_context *ctx_,
+       struct pipe_transfer *transfer)
+{
+       struct r600_context *owner = (struct r600_context*)ctx_;
+
+       util_slab_free(&owner->pool_transfers, transfer);
+}
+
+/* Not implemented: asserts unconditionally and otherwise does nothing. */
+void r600_compute_global_transfer_flush_region(
+       struct pipe_context *ctx_,
+       struct pipe_transfer *transfer,
+       const struct pipe_box *box)
+{
+       assert(0 && "TODO");
+}
+
+/* Not implemented: asserts unconditionally and otherwise does nothing. */
+void r600_compute_global_transfer_inline_write(
+       struct pipe_context *pipe,
+       struct pipe_resource *resource,
+       unsigned level,
+       unsigned usage,
+       const struct pipe_box *box,
+       const void *data,
+       unsigned stride,
+       unsigned layer_stride)
+{
+       assert(0 && "TODO");
+}
diff --git a/src/gallium/drivers/r600/evergreen_compute.h b/src/gallium/drivers/r600/evergreen_compute.h
new file mode 100644 (file)
index 0000000..a0881cd
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Adam Rak <adam.rak@streamnovation.com>
+ */
+
+#ifndef EVERGREEN_COMPUTE_H
+#define EVERGREEN_COMPUTE_H
+#include "r600.h"
+#include "r600_pipe.h"
+
+struct evergreen_compute_resource;
+
+/* Compute state objects and launch helpers (evergreen_compute.c). */
+void *evergreen_create_compute_state(struct pipe_context *ctx, const struct pipe_compute_state *cso);
+void evergreen_delete_compute_state(struct pipe_context *ctx, void *state);
+void evergreen_direct_dispatch( struct pipe_context *context, const uint *block_layout, const uint *grid_layout);
+void evergreen_compute_upload_input(struct pipe_context *context, const uint *block_layout, const uint *grid_layout, const void *input);
+void evergreen_compute_init_config(struct r600_context *rctx);
+void evergreen_init_compute_state_functions(struct r600_context *rctx);
+
+/* Creation and destruction of PIPE_BIND_GLOBAL buffers. */
+struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen, const struct pipe_resource *templ);
+void r600_compute_global_buffer_destroy(struct pipe_screen *screen, struct pipe_resource *res);
+/* Transfer entry points for global buffers. */
+void* r600_compute_global_transfer_map(struct pipe_context *ctx, struct pipe_transfer* transfer);
+void r600_compute_global_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer* transfer);
+struct pipe_transfer * r600_compute_global_get_transfer(struct pipe_context *, struct pipe_resource *, unsigned level,
+                                                        unsigned usage, const struct pipe_box *);
+void r600_compute_global_transfer_destroy(struct pipe_context *, struct pipe_transfer *);
+void r600_compute_global_transfer_flush_region( struct pipe_context *, struct pipe_transfer *, const struct pipe_box *);
+void r600_compute_global_transfer_inline_write( struct pipe_context *, struct pipe_resource *, unsigned level,
+                                                unsigned usage, const struct pipe_box *, const void *data, unsigned stride, unsigned layer_stride);
+
+
+/* printf-style debug output, enabled by the R600_COMPUTE_DEBUG option.
+ * The option is looked up on the first call only and the result is cached.
+ */
+static inline void COMPUTE_DBG(const char *fmt, ...)
+{
+   static bool check_debug = false, debug = false;
+
+   if (!check_debug) {
+      debug = debug_get_bool_option("R600_COMPUTE_DEBUG", FALSE);
+      /* Remember that the lookup was done so it is not repeated. */
+      check_debug = true;
+   }
+
+   if (debug) {
+      va_list ap;
+      va_start(ap, fmt);
+      _debug_vprintf(fmt, ap);
+      va_end(ap);
+   }
+}
+
+#endif
diff --git a/src/gallium/drivers/r600/evergreen_compute_internal.c b/src/gallium/drivers/r600/evergreen_compute_internal.c
new file mode 100644 (file)
index 0000000..209f064
--- /dev/null
@@ -0,0 +1,830 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Adam Rak <adam.rak@streamnovation.com>
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "util/u_blitter.h"
+#include "util/u_double_list.h"
+#include "util/u_transfer.h"
+#include "util/u_surface.h"
+#include "util/u_pack_color.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_framebuffer.h"
+#include "r600.h"
+#include "r600_resource.h"
+#include "r600_shader.h"
+#include "r600_pipe.h"
+#include "r600_formats.h"
+#include "evergreend.h"
+#include "evergreen_compute_internal.h"
+#include "r600_hw_context_priv.h"
+
+/**
+ * Return the total number of compute resource slots.
+ *
+ * Each DECL_COMPUTE_RESOURCE(name, n) entry of compute_resource.def adds
+ * its n to the sum.
+ */
+int get_compute_resource_num(void)
+{
+       int total = 0;
+
+#define DECL_COMPUTE_RESOURCE(name, n) total += n;
+#include "compute_resource.def"
+#undef DECL_COMPUTE_RESOURCE
+
+       return total;
+}
+
+/**
+ * Append one dword to the resource's private command buffer.
+ *
+ * \param res    resource whose cs[] array receives the value
+ * \param value  dword stored at cs[cs_end]; cs_end is then advanced by one
+ */
+void evergreen_emit_raw_value(
+       struct evergreen_compute_resource* res,
+       unsigned value)
+{
+       const int slot = res->cs_end;
+
+       res->cs[slot] = value;
+       res->cs_end = slot + 1;
+}
+
+/**
+ * Append one dword directly to the context's command stream
+ * (ctx->cs->buf) and advance its write position (ctx->cs->cdw).
+ */
+void evergreen_emit_ctx_value(struct r600_context *ctx, unsigned value)
+{
+       ctx->cs->buf[ctx->cs->cdw] = value;
+       ctx->cs->cdw++;
+}
+
+/**
+ * Emit a register-write header followed by a run of consecutive values.
+ *
+ * \param res    resource whose command buffer is appended to
+ * \param index  offset of the first register
+ * \param array  values to write, one dword each
+ * \param size   size of \p array in bytes; the header is emitted with a
+ *               count of size / 4 and one dword is copied for every
+ *               (started) group of four bytes
+ */
+void evergreen_mult_reg_set_(
+       struct evergreen_compute_resource* res,
+       int index,
+       u32* array,
+       int size)
+{
+       const u32 *src = array;
+       int done = 0;
+
+       evergreen_emit_raw_reg_set(res, index, size / 4);
+
+       while (done < size) {
+               res->cs[res->cs_end++] = *src++;
+               done += 4;
+       }
+}
+
+/**
+ * Emit a single register write (header plus one value dword) into the
+ * resource's command buffer.
+ *
+ * \param res    resource whose command buffer is appended to
+ * \param index  register offset; selects the packet type in
+ *               evergreen_emit_raw_reg_set()
+ * \param value  dword written to the register
+ */
+void evergreen_reg_set(
+       struct evergreen_compute_resource* res,
+       unsigned index,
+       unsigned value)
+{
+       evergreen_emit_raw_reg_set(res, index, 1);
+
+       res->cs[res->cs_end] = value;
+       res->cs_end += 1;
+}
+
+/**
+ * Locate the slot for a resource of kind \p res_code and reset it.
+ *
+ * The slots of pipe->resources[] are laid out in the order of the entries
+ * in compute_resource.def, each kind occupying n consecutive slots.
+ *
+ * \param pipe          compute state owning the resources[] array
+ * \param res_code      resource kind to look up
+ * \param offset_index  index within that kind's group of slots
+ * \return the slot, marked enabled, with no buffer attached, an empty
+ *         command buffer and all relocation markers cleared
+ */
+struct evergreen_compute_resource* get_empty_res(
+       struct r600_pipe_compute* pipe,
+       enum evergreen_compute_resources res_code,
+       int offset_index)
+{
+       struct evergreen_compute_resource *slot;
+       int code_index = -1;
+       int code_size = -1;
+       int first = 0;
+
+       /* Walk every declared kind, tracking the first slot of each one. */
+#define DECL_COMPUTE_RESOURCE(name, n) \
+       if (COMPUTE_RESOURCE_ ## name == res_code) { \
+               code_index = first; \
+               code_size = n; \
+       } \
+       first += n;
+#include "compute_resource.def"
+#undef DECL_COMPUTE_RESOURCE
+
+       assert(code_index != -1 && "internal error: resouce index not found");
+       assert(offset_index < code_size && "internal error: overindexing resource");
+
+       slot = &pipe->resources[code_index + offset_index];
+
+       slot->enabled = true;
+       slot->bo = NULL;
+       slot->cs_end = 0;
+       bzero(&slot->do_reloc, sizeof(slot->do_reloc));
+
+       return slot;
+}
+
+/**
+ * Emit the header of a register write into the resource's command buffer.
+ *
+ * If \p index falls into one of the known register ranges (config,
+ * context, resource, sampler, ctl const, loop const, bool const) two
+ * dwords are emitted: a PKT3C header with the matching SET_* opcode and
+ * the dword offset of the register inside its range.  Any other index
+ * gets a single PKT0 header dword.  The resource is marked enabled.
+ *
+ * \param res    resource whose command buffer is appended to
+ * \param index  register offset
+ * \param num    number of value dwords the caller will append
+ */
+void evergreen_emit_raw_reg_set(
+       struct evergreen_compute_resource* res,
+       unsigned index,
+       int num)
+{
+       const int at = res->cs_end;
+       unsigned opcode;
+       unsigned base;
+
+       res->enabled = 1;
+
+       if (index >= EVERGREEN_CONFIG_REG_OFFSET
+                       && index < EVERGREEN_CONFIG_REG_END) {
+               opcode = PKT3_SET_CONFIG_REG;
+               base = EVERGREEN_CONFIG_REG_OFFSET;
+       } else if (index >= EVERGREEN_CONTEXT_REG_OFFSET
+                       && index < EVERGREEN_CONTEXT_REG_END) {
+               opcode = PKT3_SET_CONTEXT_REG;
+               base = EVERGREEN_CONTEXT_REG_OFFSET;
+       } else if (index >= EVERGREEN_RESOURCE_OFFSET
+                       && index < EVERGREEN_RESOURCE_END) {
+               opcode = PKT3_SET_RESOURCE;
+               base = EVERGREEN_RESOURCE_OFFSET;
+       } else if (index >= EVERGREEN_SAMPLER_OFFSET
+                       && index < EVERGREEN_SAMPLER_END) {
+               opcode = PKT3_SET_SAMPLER;
+               base = EVERGREEN_SAMPLER_OFFSET;
+       } else if (index >= EVERGREEN_CTL_CONST_OFFSET
+                       && index < EVERGREEN_CTL_CONST_END) {
+               opcode = PKT3_SET_CTL_CONST;
+               base = EVERGREEN_CTL_CONST_OFFSET;
+       } else if (index >= EVERGREEN_LOOP_CONST_OFFSET
+                       && index < EVERGREEN_LOOP_CONST_END) {
+               opcode = PKT3_SET_LOOP_CONST;
+               base = EVERGREEN_LOOP_CONST_OFFSET;
+       } else if (index >= EVERGREEN_BOOL_CONST_OFFSET
+                       && index < EVERGREEN_BOOL_CONST_END) {
+               opcode = PKT3_SET_BOOL_CONST;
+               base = EVERGREEN_BOOL_CONST_OFFSET;
+       } else {
+               /* Outside every known range: a lone PKT0 header dword. */
+               res->cs[at] = PKT0(index, num-1);
+               res->cs_end = at + 1;
+               return;
+       }
+
+       res->cs[at] = PKT3C(opcode, num, 0);
+       res->cs[at + 1] = (index - base) >> 2;
+       res->cs_end = at + 2;
+}
+
+/**
+ * Request one more relocation at the current end of the resource's
+ * command buffer by bumping do_reloc[cs_end].
+ */
+void evergreen_emit_force_reloc(struct evergreen_compute_resource* res)
+{
+       const int at = res->cs_end;
+
+       res->do_reloc[at] = res->do_reloc[at] + 1;
+}
+
+/**
+ * Emit the header of a register write directly into the context's
+ * command stream.
+ *
+ * Registers inside one of the known ranges get a PKT3C header with the
+ * range's SET_* opcode followed by the register's dword offset inside the
+ * range; any other index gets a single PKT0 header dword.
+ *
+ * \param ctx    context whose command stream (ctx->cs) is appended to
+ * \param index  register offset
+ * \param num    number of value dwords the caller will append
+ */
+void evergreen_emit_ctx_reg_set(
+       struct r600_context *ctx,
+       unsigned index,
+       int num)
+{
+       /* Checked in this order; the first matching range wins. */
+       static const struct {
+               unsigned first;   /* first offset belonging to the range */
+               unsigned end;     /* first offset past the range */
+               unsigned opcode;  /* PKT3 opcode used for the range */
+       } ranges[] = {
+               { EVERGREEN_CONFIG_REG_OFFSET, EVERGREEN_CONFIG_REG_END, PKT3_SET_CONFIG_REG },
+               { EVERGREEN_CONTEXT_REG_OFFSET, EVERGREEN_CONTEXT_REG_END, PKT3_SET_CONTEXT_REG },
+               { EVERGREEN_RESOURCE_OFFSET, EVERGREEN_RESOURCE_END, PKT3_SET_RESOURCE },
+               { EVERGREEN_SAMPLER_OFFSET, EVERGREEN_SAMPLER_END, PKT3_SET_SAMPLER },
+               { EVERGREEN_CTL_CONST_OFFSET, EVERGREEN_CTL_CONST_END, PKT3_SET_CTL_CONST },
+               { EVERGREEN_LOOP_CONST_OFFSET, EVERGREEN_LOOP_CONST_END, PKT3_SET_LOOP_CONST },
+               { EVERGREEN_BOOL_CONST_OFFSET, EVERGREEN_BOOL_CONST_END, PKT3_SET_BOOL_CONST },
+       };
+       unsigned i;
+
+       for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
+               if (index >= ranges[i].first && index < ranges[i].end) {
+                       ctx->cs->buf[ctx->cs->cdw++] = PKT3C(ranges[i].opcode, num, 0);
+                       ctx->cs->buf[ctx->cs->cdw++] = (index - ranges[i].first) >> 2;
+                       return;
+               }
+       }
+
+       ctx->cs->buf[ctx->cs->cdw++] = PKT0(index, num-1);
+}
+
+/**
+ * Emit a relocation for \p bo into the context's command stream: a
+ * PKT3_NOP header followed by the value returned by
+ * r600_context_bo_reloc().
+ *
+ * \param ctx    context whose command stream is appended to
+ * \param bo     buffer to relocate; must not be NULL
+ * \param usage  usage passed on to r600_context_bo_reloc()
+ */
+void evergreen_emit_ctx_reloc(
+       struct r600_context *ctx,
+       struct r600_resource *bo,
+       enum radeon_bo_usage usage)
+{
+       u32 reloc;
+
+       assert(bo);
+
+       /* The header goes out before the relocation is registered. */
+       ctx->cs->buf[ctx->cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+
+       reloc = r600_context_bo_reloc(ctx, bo, usage);
+       ctx->cs->buf[ctx->cs->cdw++] = reloc;
+}
+
+/**
+ * Emit a PKT3_SURFACE_SYNC packet for \p bo into the context's command
+ * stream.
+ *
+ * \param ctx    context whose command stream is appended to
+ * \param bo     buffer being synchronised; must not be NULL
+ * \param size   size to cover; 0 or 0xffffffff selects a coherency size of
+ *               0xffffffff, anything else is rounded up to 256-byte units
+ * \param flags  COMPUTE_RES_*_FLUSH flags selecting the *_ACTION_ENA bits;
+ *               for a CB flush, bits 8..11 select the CBn_DEST_BASE_ENA bit
+ * \param usage  usage passed to the relocation; a relocation is emitted
+ *               only when the coherency size is not 0xffffffff
+ */
+void evergreen_set_buffer_sync(
+       struct r600_context *ctx,
+       struct r600_resource* bo,
+       int size,
+       int flags,
+       enum radeon_bo_usage usage)
+{
+       /* CBn_DEST_BASE_ENA bit for each color buffer index 0..11. */
+       static const uint32_t cb_dest_base_ena[] = {
+               S_0085F0_CB0_DEST_BASE_ENA(1),
+               S_0085F0_CB1_DEST_BASE_ENA(1),
+               S_0085F0_CB2_DEST_BASE_ENA(1),
+               S_0085F0_CB3_DEST_BASE_ENA(1),
+               S_0085F0_CB4_DEST_BASE_ENA(1),
+               S_0085F0_CB5_DEST_BASE_ENA(1),
+               S_0085F0_CB6_DEST_BASE_ENA(1),
+               S_0085F0_CB7_DEST_BASE_ENA(1),
+               S_0085F0_CB8_DEST_BASE_ENA(1),
+               S_0085F0_CB9_DEST_BASE_ENA(1),
+               S_0085F0_CB10_DEST_BASE_ENA(1),
+               S_0085F0_CB11_DEST_BASE_ENA(1),
+       };
+
+       assert(bo);
+       int32_t cp_coher_size = 0;
+
+       if (size != 0xffffffff && size != 0) {
+               cp_coher_size = ((size + 255) >> 8);
+       }
+       else {
+               cp_coher_size = 0xffffffff;
+       }
+
+       uint32_t sync_flags = 0;
+
+       if ((flags & COMPUTE_RES_TC_FLUSH) == COMPUTE_RES_TC_FLUSH) {
+               sync_flags |= S_0085F0_TC_ACTION_ENA(1);
+       }
+
+       if ((flags & COMPUTE_RES_VC_FLUSH) == COMPUTE_RES_VC_FLUSH) {
+               sync_flags |= S_0085F0_VC_ACTION_ENA(1);
+       }
+
+       if ((flags & COMPUTE_RES_SH_FLUSH) == COMPUTE_RES_SH_FLUSH) {
+               sync_flags |= S_0085F0_SH_ACTION_ENA(1);
+       }
+
+       if ((flags & COMPUTE_RES_CB_FLUSH(0)) == COMPUTE_RES_CB_FLUSH(0)) {
+               const unsigned cb = (flags >> 8) & 0xF;
+
+               sync_flags |= S_0085F0_CB_ACTION_ENA(1);
+
+               if (cb < sizeof(cb_dest_base_ena) / sizeof(cb_dest_base_ena[0])) {
+                       sync_flags |= cb_dest_base_ena[cb];
+               } else {
+                       assert(0);
+               }
+       }
+
+       int32_t poll_interval = 10;
+
+       ctx->cs->buf[ctx->cs->cdw++] = PKT3(PKT3_SURFACE_SYNC, 3, 0);
+       ctx->cs->buf[ctx->cs->cdw++] = sync_flags;
+       ctx->cs->buf[ctx->cs->cdw++] = cp_coher_size;
+       ctx->cs->buf[ctx->cs->cdw++] = 0;
+       ctx->cs->buf[ctx->cs->cdw++] = poll_interval;
+
+       if (cp_coher_size != 0xffffffff) {
+               evergreen_emit_ctx_reloc(ctx, bo, usage);
+       }
+}
+
+/**
+ * Translate the pipe format of \p bo into a color format / number type
+ * pair.
+ *
+ * \param fmt  receives format, number_type and num_format_all (always 0)
+ *             on success; left untouched on failure
+ * \param bo   resource whose b.b.format is translated
+ * \return 1 if the format is supported, 0 otherwise
+ */
+int evergreen_compute_get_gpu_format(
+       struct number_type_and_format* fmt,
+       struct r600_resource *bo)
+{
+       unsigned color_format;
+       unsigned number_type;
+
+       switch (bo->b.b.format) {
+       case PIPE_FORMAT_R8_UNORM:
+       case PIPE_FORMAT_R32_UNORM:
+               color_format = V_028C70_COLOR_32;
+               number_type = V_028C70_NUMBER_UNORM;
+               break;
+       case PIPE_FORMAT_R32_FLOAT:
+               color_format = V_028C70_COLOR_32_FLOAT;
+               number_type = V_028C70_NUMBER_FLOAT;
+               break;
+       case PIPE_FORMAT_R32G32B32A32_FLOAT:
+               color_format = V_028C70_COLOR_32_32_32_32_FLOAT;
+               number_type = V_028C70_NUMBER_FLOAT;
+               break;
+
+       ///TODO: other formats...
+
+       default:
+               return 0;
+       }
+
+       fmt->format = color_format;
+       fmt->number_type = number_type;
+       fmt->num_format_all = 0;
+
+       return 1;
+}
+
+/**
+ * Build the register writes that bind \p bo as RAT number \p id.
+ *
+ * Only linear buffers (PIPE_BUFFER with height0 and depth0 <= 1) are
+ * implemented; anything else hits an assert.
+ *
+ * \param pipe   compute state owning the RAT resource slots
+ * \param id     RAT index, must be < 12
+ * \param bo     buffer to bind
+ * \param start  start offset, must be a multiple of 256
+ * \param size   size, must be a multiple of 4; also written to
+ *               CB_COLOR0_DIM and used as the coherency size
+ *
+ * NOTE(review): only CB_COLOR0_BASE is adjusted by the per-RAT register
+ * offset; every other register is written at its COLOR0 address -- confirm
+ * this is intended for id != 0.
+ */
+void evergreen_set_rat(
+       struct r600_pipe_compute *pipe,
+       int id,
+       struct r600_resource* bo,
+       int start,
+       int size)
+{
+       assert(id < 12);
+       assert((size & 3) == 0);
+       assert((start & 0xFF) == 0);
+
+       int offset;
+       COMPUTE_DBG("bind rat: %i \n", id);
+
+       /* Register block stride: 0x3c for the first 8 RATs, 0x1c after. */
+       if (id < 8) {
+               offset = id*0x3c;
+       }
+       else {
+               offset = 8*0x3c + (id-8)*0x1c;
+       }
+
+       int linear = 0;
+
+       if (bo->b.b.height0 <= 1 && bo->b.b.depth0 <= 1
+                       && bo->b.b.target == PIPE_BUFFER) {
+               linear = 1;
+       }
+
+       struct evergreen_compute_resource* res =
+               get_empty_res(pipe, COMPUTE_RESOURCE_RAT, id);
+
+       evergreen_emit_force_reloc(res);
+
+       evergreen_reg_set(res, R_028C64_CB_COLOR0_PITCH, 0); ///TODO: for 2D?
+       evergreen_reg_set(res, R_028C68_CB_COLOR0_SLICE, 0);
+
+       struct number_type_and_format fmt;
+
+       /* Use the default config for PIPE_FORMAT_NONE and also for formats
+        * the translation does not know; previously the latter left fmt
+        * uninitialized and its garbage was written to CB_COLOR0_INFO. */
+       if (bo->b.b.format == PIPE_FORMAT_NONE
+                       || !evergreen_compute_get_gpu_format(&fmt, bo)) {
+               fmt.format = V_028C70_COLOR_32;
+               fmt.number_type = V_028C70_NUMBER_FLOAT;
+       }
+
+       if (linear) {
+               evergreen_reg_set(res,
+                       R_028C70_CB_COLOR0_INFO, S_028C70_RAT(1)
+                       | S_028C70_ARRAY_MODE(V_028C70_ARRAY_LINEAR_ALIGNED)
+                       | S_028C70_FORMAT(fmt.format)
+                       | S_028C70_NUMBER_TYPE(fmt.number_type)
+               );
+               evergreen_emit_force_reloc(res);
+       } else {
+               assert(0 && "TODO");
+               ///TODO
+//      evergreen_reg_set(res, R_028C70_CB_COLOR0_INFO, S_028C70_RAT(1) | S_028C70_ARRAY_MODE(????));
+//      evergreen_emit_force_reloc(res);
+       }
+
+       evergreen_reg_set(res, R_028C74_CB_COLOR0_ATTRIB, S_028C74_NON_DISP_TILING_ORDER(1));
+       evergreen_emit_force_reloc(res);
+
+       if (linear) {
+               /* XXX: Why are we using size instead of bo->b.b.b.width0 ? */
+               evergreen_reg_set(res, R_028C78_CB_COLOR0_DIM, size);
+       } else {
+               evergreen_reg_set(res, R_028C78_CB_COLOR0_DIM,
+                       S_028C78_WIDTH_MAX(bo->b.b.width0)
+                       | S_028C78_HEIGHT_MAX(bo->b.b.height0));
+       }
+
+       /* CMASK/FMASK are only written for the first 8 RATs. */
+       if (id < 8) {
+               evergreen_reg_set(res, R_028C7C_CB_COLOR0_CMASK, 0);
+               evergreen_emit_force_reloc(res);
+               evergreen_reg_set(res, R_028C84_CB_COLOR0_FMASK, 0);
+               evergreen_emit_force_reloc(res);
+       }
+
+       /* The base address is programmed in 256-byte units. */
+       evergreen_reg_set(res, R_028C60_CB_COLOR0_BASE + offset, start >> 8);
+
+       res->bo = bo;
+       res->usage = RADEON_USAGE_READWRITE;
+       res->coher_bo_size = size;
+       res->flags = COMPUTE_RES_CB_FLUSH(id);
+}
+
+/**
+ * Build the register writes for the LDS resource slot.
+ *
+ * \param pipe       compute state owning the LDS resource slot
+ * \param num_lds    value for NUM_LS_LDS in SQ_LDS_RESOURCE_MGMT
+ * \param size       low bits of the SQ_LDS_ALLOC value
+ * \param num_waves  placed at bit 14 and up of the SQ_LDS_ALLOC value
+ */
+void evergreen_set_lds(
+       struct r600_pipe_compute *pipe,
+       int num_lds,
+       int size,
+       int num_waves)
+{
+       struct evergreen_compute_resource* lds =
+               get_empty_res(pipe, COMPUTE_RESOURCE_LDS, 0);
+       const int lds_alloc = size | (num_waves << 14);
+
+       evergreen_reg_set(lds, R_008E2C_SQ_LDS_RESOURCE_MGMT,
+               S_008E2C_NUM_LS_LDS(num_lds));
+       evergreen_reg_set(lds, CM_R_0288E8_SQ_LDS_ALLOC, lds_alloc);
+}
+
+/**
+ * Build the register writes for the GDS resource slot:
+ * GDS_ORDERED_WAVE_PER_SE is set to 1, then the base address and size
+ * registers are written.
+ *
+ * \param pipe  compute state owning the GDS resource slot
+ * \param addr  value for GDS_ADDR_BASE
+ * \param size  value for GDS_ADDR_SIZE
+ */
+void evergreen_set_gds(
+       struct r600_pipe_compute *pipe,
+       uint32_t addr,
+       uint32_t size)
+{
+       struct evergreen_compute_resource* gds;
+
+       gds = get_empty_res(pipe, COMPUTE_RESOURCE_GDS, 0);
+
+       evergreen_reg_set(gds, R_028728_GDS_ORDERED_WAVE_PER_SE, 1);
+       evergreen_reg_set(gds, R_028720_GDS_ADDR_BASE, addr);
+       evergreen_reg_set(gds, R_028724_GDS_ADDR_SIZE, size);
+}
+
+/**
+ * Build the register writes for the memory-export resource slot.
+ *
+ * SX_MEMORY_EXPORT_SIZE is always written.  The base register and the
+ * buffer binding (write usage, no flush flags) are only set up when
+ * \p size is non-zero.
+ *
+ * \param pipe    compute state owning the export resource slot
+ * \param bo      buffer to bind when size is non-zero
+ * \param offset  value for SX_MEMORY_EXPORT_BASE
+ * \param size    value for SX_MEMORY_EXPORT_SIZE and the coherency size
+ */
+void evergreen_set_export(
+       struct r600_pipe_compute *pipe,
+       struct r600_resource* bo,
+       int offset, int size)
+{
+       #define SX_MEMORY_EXPORT_BASE 0x9010
+       #define SX_MEMORY_EXPORT_SIZE 0x9014
+
+       struct evergreen_compute_resource* res =
+               get_empty_res(pipe, COMPUTE_RESOURCE_EXPORT, 0);
+
+       evergreen_reg_set(res, SX_MEMORY_EXPORT_SIZE, size);
+
+       /* With a zero size there is nothing further to set up. */
+       if (!size) {
+               return;
+       }
+
+       evergreen_reg_set(res, SX_MEMORY_EXPORT_BASE, offset);
+       res->bo = bo;
+       res->usage = RADEON_USAGE_WRITE;
+       res->coher_bo_size = size;
+       res->flags = 0;
+}
+
+/**
+ * Build the register write for loop constant \p id.
+ *
+ * \param pipe   compute state owning the loop resource slots
+ * \param id     loop constant index, must be < 32
+ * \param count  placed in the low bits of the value, must be <= 0xFFF
+ * \param init   placed at bit 12 and up, must be <= 0xFF
+ * \param inc    placed at bit 24 and up, must be <= 0xFF
+ */
+void evergreen_set_loop_const(
+       struct r600_pipe_compute *pipe,
+       int id, int count, int init, int inc) {
+
+       struct evergreen_compute_resource* res =
+               get_empty_res(pipe, COMPUTE_RESOURCE_LOOP, id);
+
+       assert(id < 32);
+       assert(count <= 0xFFF);
+       assert(init <= 0xFF);
+       assert(inc <= 0xFF);
+
+       /* Compute shaders use LOOP_CONST registers SQ_LOOP_CONST_160 to
+        * SQ_LOOP_CONST_191; each register is 4 bytes wide. */
+       const int reg = R_03A200_SQ_LOOP_CONST_0 + (160 + id) * 4;
+       const int packed = count | (init << 12) | (inc << 24);
+
+       evergreen_reg_set(res, reg, packed);
+}
+
+/**
+ * Build the register writes for the temporary-ring slot of index \p se.
+ *
+ * GRBM_GFX_INDEX is first written with SE_INDEX(se) and instance
+ * broadcast, then the ring size (and, for a non-zero size, the ring base
+ * plus buffer binding and a forced relocation) is set up, and finally
+ * GRBM_GFX_INDEX is written again with both broadcast bits set.
+ *
+ * \param pipe    compute state owning the tmp-ring resource slots
+ * \param bo      ring buffer; must not be NULL when size is non-zero
+ * \param offset  value for SQ_LSTMP_RING_BASE
+ * \param size    value for SQ_LSTMP_RING_SIZE
+ * \param se      slot index, also used for SE_INDEX()
+ */
+void evergreen_set_tmp_ring(
+       struct r600_pipe_compute *pipe,
+       struct r600_resource* bo,
+       int offset, int size, int se)
+{
+       #define SQ_LSTMP_RING_BASE 0x00008e10
+       #define SQ_LSTMP_RING_SIZE 0x00008e14
+       #define GRBM_GFX_INDEX                                  0x802C
+       #define         INSTANCE_INDEX(x)                       ((x) << 0)
+       #define         SE_INDEX(x)                             ((x) << 16)
+       #define         INSTANCE_BROADCAST_WRITES               (1 << 30)
+       #define         SE_BROADCAST_WRITES                     (1 << 31)
+
+       struct evergreen_compute_resource* ring =
+               get_empty_res(pipe, COMPUTE_RESOURCE_TMPRING, se);
+       const unsigned select_one = INSTANCE_INDEX(0)
+               | SE_INDEX(se)
+               | INSTANCE_BROADCAST_WRITES;
+       const unsigned select_all = INSTANCE_INDEX(0)
+               | SE_INDEX(0)
+               | INSTANCE_BROADCAST_WRITES
+               | SE_BROADCAST_WRITES;
+
+       evergreen_reg_set(ring, GRBM_GFX_INDEX, select_one);
+       evergreen_reg_set(ring, SQ_LSTMP_RING_SIZE, size);
+
+       if (size) {
+               assert(bo);
+
+               evergreen_reg_set(ring, SQ_LSTMP_RING_BASE, offset);
+               ring->bo = bo;
+               ring->usage = RADEON_USAGE_WRITE;
+               ring->coher_bo_size = 0;
+               ring->flags = 0;
+
+               evergreen_emit_force_reloc(ring);
+       }
+
+       evergreen_reg_set(ring, GRBM_GFX_INDEX, select_all);
+}
+
+/**
+ * Select the ENDIAN_* swap mode for a V_028C70_COLOR_* format.
+ *
+ * When R600_BIG_ENDIAN is false the result is always ENDIAN_NONE.
+ * Otherwise the swap mode depends on the format; formats not listed
+ * below are treated as unsupported and yield ENDIAN_NONE.
+ */
+static uint32_t r600_colorformat_endian_swap(uint32_t colorformat)
+{
+       if (!R600_BIG_ENDIAN) {
+               return ENDIAN_NONE;
+       }
+
+       switch(colorformat) {
+       /* Formats that need no swap. */
+       case V_028C70_COLOR_4_4:
+       case V_028C70_COLOR_8:
+               return ENDIAN_NONE;
+
+       /* Swapped per 16-bit word: the 16-bit formats and the 64-bit
+        * formats built from 16-bit channels. */
+       case V_028C70_COLOR_5_6_5:
+       case V_028C70_COLOR_1_5_5_5:
+       case V_028C70_COLOR_4_4_4_4:
+       case V_028C70_COLOR_16:
+       case V_028C70_COLOR_8_8:
+       case V_028C70_COLOR_16_16_16_16:
+       case V_028C70_COLOR_16_16_16_16_FLOAT:
+               return ENDIAN_8IN16;
+
+       /* Swapped per 32-bit word: the 32-bit formats and the 64-, 96- and
+        * 128-bit formats built from 32-bit channels. */
+       case V_028C70_COLOR_8_8_8_8:
+       case V_028C70_COLOR_2_10_10_10:
+       case V_028C70_COLOR_8_24:
+       case V_028C70_COLOR_24_8:
+       case V_028C70_COLOR_32_FLOAT:
+       case V_028C70_COLOR_16_16_FLOAT:
+       case V_028C70_COLOR_16_16:
+       case V_028C70_COLOR_32_32_FLOAT:
+       case V_028C70_COLOR_32_32:
+       case V_028C70_COLOR_X24_8_32_FLOAT:
+       case V_028C70_COLOR_32_32_32_FLOAT:
+       case V_028C70_COLOR_32_32_32_32_FLOAT:
+       case V_028C70_COLOR_32_32_32_32:
+               return ENDIAN_8IN32;
+
+       default:
+               return ENDIAN_NONE; /* Unsupported. */
+       }
+}
+
+/**
+ * Map a PIPE_TEXTURE_* target to the matching V_030000_SQ_TEX_DIM_*
+ * value.  Targets that are not listed map to the 1D dimension.
+ */
+static unsigned r600_tex_dim(unsigned dim)
+{
+       unsigned hw_dim;
+
+       switch (dim) {
+       case PIPE_TEXTURE_1D_ARRAY:
+               hw_dim = V_030000_SQ_TEX_DIM_1D_ARRAY;
+               break;
+       case PIPE_TEXTURE_2D:
+       case PIPE_TEXTURE_RECT:
+               hw_dim = V_030000_SQ_TEX_DIM_2D;
+               break;
+       case PIPE_TEXTURE_2D_ARRAY:
+               hw_dim = V_030000_SQ_TEX_DIM_2D_ARRAY;
+               break;
+       case PIPE_TEXTURE_3D:
+               hw_dim = V_030000_SQ_TEX_DIM_3D;
+               break;
+       case PIPE_TEXTURE_CUBE:
+               hw_dim = V_030000_SQ_TEX_DIM_CUBEMAP;
+               break;
+       case PIPE_TEXTURE_1D:
+       default:
+               hw_dim = V_030000_SQ_TEX_DIM_1D;
+               break;
+       }
+
+       return hw_dim;
+}
+
+/**
+ * Build the SET_RESOURCE packet that binds \p bo as vertex buffer
+ * resource \p id.
+ *
+ * \param pipe      compute state owning the vertex resource slots
+ * \param bo        buffer to bind; height0 and depth0 must be <= 1 and
+ *                  its format must be known to
+ *                  evergreen_compute_get_gpu_format()
+ * \param id        resource index, must be < 16
+ * \param offset    64-bit address; the low 32 bits go into the first
+ *                  resource word, the bits above into BASE_ADDRESS_HI
+ * \param writable  non-zero selects read/write usage, zero read-only
+ */
+void evergreen_set_vtx_resource(
+       struct r600_pipe_compute *pipe,
+       struct r600_resource* bo,
+       int id, uint64_t offset, int writable)
+{
+       assert(id < 16);
+       uint32_t sq_vtx_constant_word2, sq_vtx_constant_word3, sq_vtx_constant_word4;
+       struct number_type_and_format fmt;
+
+       /* Fully initialize fmt so that no indeterminate field is read if the
+        * format lookup fails in a build with asserts compiled out. */
+       fmt.format = 0;
+       fmt.number_type = 0;
+       fmt.num_format_all = 0;
+
+       assert(bo->b.b.height0 <= 1);
+       assert(bo->b.b.depth0 <= 1);
+
+       int e = evergreen_compute_get_gpu_format(&fmt, bo);
+
+       assert(e && "unknown format");
+       (void)e; /* only read by the assert above */
+
+       struct evergreen_compute_resource* res =
+               get_empty_res(pipe, COMPUTE_RESOURCE_VERT, id);
+
+       /* NOTE: size is taken directly from width0; scaling it by the format
+        * block size was disabled by the original author. */
+       unsigned size = bo->b.b.width0;
+       unsigned stride = 1;
+
+       COMPUTE_DBG("id: %i vtx size: %i byte,  width0: %i elem\n",
+               id, size, bo->b.b.width0);
+
+       sq_vtx_constant_word2 =
+               S_030008_BASE_ADDRESS_HI(offset >> 32) |
+               S_030008_STRIDE(stride) |
+               S_030008_DATA_FORMAT(fmt.format) |
+               S_030008_NUM_FORMAT_ALL(fmt.num_format_all) |
+               S_030008_ENDIAN_SWAP(0);
+
+       /* offset is 64 bits wide: it needs a matching conversion specifier,
+        * passing it to %i is an undefined varargs mismatch. */
+       COMPUTE_DBG("%08X %llu %i %i %i\n", sq_vtx_constant_word2,
+                       (unsigned long long)offset,
+                       stride, fmt.format, fmt.num_format_all);
+
+       sq_vtx_constant_word3 =
+               S_03000C_DST_SEL_X(0) |
+               S_03000C_DST_SEL_Y(1) |
+               S_03000C_DST_SEL_Z(2) |
+               S_03000C_DST_SEL_W(3);
+
+       sq_vtx_constant_word4 = 0;
+
+       /* Header, resource offset, then the eight resource words. */
+       evergreen_emit_raw_value(res, PKT3C(PKT3_SET_RESOURCE, 8, 0));
+       evergreen_emit_raw_value(res, (id+816)*32 >> 2);
+       evergreen_emit_raw_value(res, (unsigned)((offset) & 0xffffffff));
+       evergreen_emit_raw_value(res, size - 1);
+       evergreen_emit_raw_value(res, sq_vtx_constant_word2);
+       evergreen_emit_raw_value(res, sq_vtx_constant_word3);
+       evergreen_emit_raw_value(res, sq_vtx_constant_word4);
+       evergreen_emit_raw_value(res, 0);
+       evergreen_emit_raw_value(res, 0);
+       evergreen_emit_raw_value(res, S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_BUFFER));
+
+       res->bo = bo;
+
+       if (writable) {
+               res->usage = RADEON_USAGE_READWRITE;
+       }
+       else {
+               res->usage = RADEON_USAGE_READ;
+       }
+
+       res->coher_bo_size = size;
+       res->flags = COMPUTE_RES_TC_FLUSH | COMPUTE_RES_VC_FLUSH;
+}
+
+/**
+ * Build the SET_RESOURCE packet that binds the texture behind \p view as
+ * texture resource \p id.
+ *
+ * Array targets (1D_ARRAY, 2D_ARRAY) are rejected by assert.  The slot is
+ * bound read-only with a texture-cache flush, and two relocations are
+ * requested at the end of the packet.
+ *
+ * \param pipe  compute state owning the texture resource slots
+ * \param view  sampler view providing the texture and the view format
+ * \param id    resource index
+ */
+void evergreen_set_tex_resource(
+       struct r600_pipe_compute *pipe,
+       struct r600_pipe_sampler_view* view,
+       int id)
+{
+       struct evergreen_compute_resource* res =
+               get_empty_res(pipe, COMPUTE_RESOURCE_TEX, id);
+       struct r600_resource_texture *rtex =
+               (struct r600_resource_texture*)view->base.texture;
+
+       unsigned format, endian;
+       uint32_t word4 = 0, yuv_format = 0, pitch = 0;
+       unsigned char swizzle[4] = { 0, 1, 2, 3 };
+       unsigned char array_mode = 0, tile_type = 0;
+       unsigned height, depth;
+
+       format = r600_translate_texformat((struct pipe_screen *)pipe->ctx->screen,
+               view->base.format, swizzle, &word4, &yuv_format);
+
+       /* A result of ~0 from the translation is replaced by format 0. */
+       if (format == ~0) {
+               format = 0;
+       }
+
+       endian = r600_colorformat_endian_swap(format);
+
+       height = view->base.texture->height0;
+       depth = view->base.texture->depth0;
+
+       pitch = align(rtex->pitch_in_blocks[0] *
+               util_format_get_blockwidth(rtex->real_format), 8);
+       array_mode = rtex->array_mode[0];
+       tile_type = rtex->tile_type;
+
+       assert(view->base.texture->target != PIPE_TEXTURE_1D_ARRAY);
+       assert(view->base.texture->target != PIPE_TEXTURE_2D_ARRAY);
+
+       /* Header and resource offset. */
+       evergreen_emit_raw_value(res, PKT3C(PKT3_SET_RESOURCE, 8, 0));
+       evergreen_emit_raw_value(res, (id+816)*32 >> 2); ///TODO: check this line
+
+       /* Word 0: dimension, pitch, tiling order and width. */
+       evergreen_emit_raw_value(res,
+               S_030000_DIM(r600_tex_dim(view->base.texture->target)) |
+               S_030000_PITCH((pitch / 8) - 1) |
+               S_030000_NON_DISP_TILING_ORDER(tile_type) |
+               S_030000_TEX_WIDTH(view->base.texture->width0 - 1));
+
+       /* Word 1: height, depth and array mode. */
+       evergreen_emit_raw_value(res,
+               S_030004_TEX_HEIGHT(height - 1) |
+               S_030004_TEX_DEPTH(depth - 1) |
+               S_030004_ARRAY_MODE(array_mode));
+
+       /* Words 2 and 3: both carry the level-0 offset in 256-byte units. */
+       evergreen_emit_raw_value(res, rtex->offset[0] >> 8);
+       evergreen_emit_raw_value(res, rtex->offset[0] >> 8);
+
+       /* Word 4: bits from the format translation plus swap/base level. */
+       evergreen_emit_raw_value(res,
+               word4 |
+               S_030010_SRF_MODE_ALL(V_030010_SRF_MODE_ZERO_CLAMP_MINUS_ONE) |
+               S_030010_ENDIAN_SWAP(endian) |
+               S_030010_BASE_LEVEL(0));
+
+       /* Word 5: a single level and a single array slice. */
+       evergreen_emit_raw_value(res,
+               S_030014_LAST_LEVEL(0) |
+               S_030014_BASE_ARRAY(0) |
+               S_030014_LAST_ARRAY(0));
+
+       /* Word 6. */
+       evergreen_emit_raw_value(res, (S_030018_MAX_ANISO(4 /* max 16 samples */)));
+
+       /* Word 7: resource type and data format. */
+       evergreen_emit_raw_value(res,
+               S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_TEXTURE)
+               | S_03001C_DATA_FORMAT(format));
+
+       res->bo = (struct r600_resource*)view->base.texture;
+
+       res->usage = RADEON_USAGE_READ;
+
+       res->coher_bo_size = rtex->offset[0] + util_format_get_blockwidth(rtex->real_format)*view->base.texture->width0*height*depth;
+       res->flags = COMPUTE_RES_TC_FLUSH;
+
+       evergreen_emit_force_reloc(res);
+       evergreen_emit_force_reloc(res);
+}
+
+/**
+ * Build the SET_SAMPLER packet for sampler \p id from the gallium
+ * sampler state in \p sampler.
+ *
+ * \param pipe     compute state owning the sampler resource slots
+ * \param sampler  provides wrap modes, filters, LOD range/bias, maximum
+ *                 anisotropy and the seamless-cube-map setting
+ * \param id       sampler index
+ */
+void evergreen_set_sampler_resource(
+       struct r600_pipe_compute *pipe,
+       struct compute_sampler_state *sampler,
+       int id)
+{
+       struct evergreen_compute_resource* res =
+               get_empty_res(pipe, COMPUTE_RESOURCE_SAMPLER, id);
+
+       /* OR-ed into both filter fields when anisotropy is above 1. */
+       unsigned aniso_flag_offset = sampler->state.max_anisotropy > 1 ? 2 : 0;
+
+       /* Word 0: clamp modes, XY filters and border color type. */
+       const unsigned word0 =
+               S_03C000_CLAMP_X(r600_tex_wrap(sampler->state.wrap_s)) |
+               S_03C000_CLAMP_Y(r600_tex_wrap(sampler->state.wrap_t)) |
+               S_03C000_CLAMP_Z(r600_tex_wrap(sampler->state.wrap_r)) |
+               S_03C000_XY_MAG_FILTER(r600_tex_filter(sampler->state.mag_img_filter) | aniso_flag_offset) |
+               S_03C000_XY_MIN_FILTER(r600_tex_filter(sampler->state.min_img_filter) | aniso_flag_offset) |
+               S_03C000_BORDER_COLOR_TYPE(V_03C000_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK);
+
+       /* Word 1: LOD range, clamped to [0, 15]. */
+       const unsigned word1 =
+               S_03C004_MIN_LOD(S_FIXED(CLAMP(sampler->state.min_lod, 0, 15), 8)) |
+               S_03C004_MAX_LOD(S_FIXED(CLAMP(sampler->state.max_lod, 0, 15), 8));
+
+       /* Word 2: LOD bias clamped to [-16, 16], cube wrap and type. */
+       const unsigned word2 =
+               S_03C008_LOD_BIAS(S_FIXED(CLAMP(sampler->state.lod_bias, -16, 16), 8)) |
+               (sampler->state.seamless_cube_map ? 0 : S_03C008_DISABLE_CUBE_WRAP(1)) |
+               S_03C008_TYPE(1);
+
+       evergreen_emit_raw_value(res, PKT3C(PKT3_SET_SAMPLER, 3, 0));
+       evergreen_emit_raw_value(res, (id + 90)*3);
+       evergreen_emit_raw_value(res, word0);
+       evergreen_emit_raw_value(res, word1);
+       evergreen_emit_raw_value(res, word2);
+}
+
+/**
+ * Build the register writes that bind \p cbo as constant cache
+ * \p cache_id.
+ *
+ * \param pipe      compute state owning the constant-memory slots
+ * \param cache_id  cache index, must be < 16
+ * \param cbo       constant buffer, bound read-only with an SH flush
+ * \param size      value for the buffer-size register, must be < 0x200;
+ *                  also used as the coherency size
+ * \param offset    must be a multiple of 256; written in 256-byte units
+ */
+void evergreen_set_const_cache(
+       struct r600_pipe_compute *pipe,
+       int cache_id,
+       struct r600_resource* cbo,
+       int size, int offset)
+{
+       #define SQ_ALU_CONST_BUFFER_SIZE_LS_0 0x00028fc0
+       #define SQ_ALU_CONST_CACHE_LS_0 0x00028f40
+
+       struct evergreen_compute_resource* res =
+               get_empty_res(pipe, COMPUTE_RESOURCE_CONST_MEM, cache_id);
+
+       assert(size < 0x200);
+       assert((offset & 0xFF) == 0);
+       assert(cache_id < 16);
+
+       /* One register of each kind per cache, 4 bytes apart. */
+       const int reg_stride = cache_id*4;
+
+       evergreen_reg_set(res, SQ_ALU_CONST_BUFFER_SIZE_LS_0 + reg_stride, size);
+       evergreen_reg_set(res, SQ_ALU_CONST_CACHE_LS_0 + reg_stride, offset >> 8);
+
+       res->bo = cbo;
+       res->usage = RADEON_USAGE_READ;
+       res->coher_bo_size = size;
+       res->flags = COMPUTE_RES_SH_FLUSH;
+}
+
+/**
+ * Create a buffer of \p size through pipe_buffer_create() with
+ * PIPE_BIND_CUSTOM binding and PIPE_USAGE_IMMUTABLE usage.
+ *
+ * \param screen  screen the buffer is created on
+ * \param size    requested size, must be non-zero
+ * \return the result of pipe_buffer_create(), cast to r600_resource
+ */
+struct r600_resource* r600_compute_buffer_alloc_vram(
+       struct r600_screen *screen,
+       unsigned size)
+{
+       struct pipe_resource *buffer;
+
+       assert(size);
+
+       buffer = pipe_buffer_create((struct pipe_screen*) screen,
+                       PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, size);
+
+       return (struct r600_resource *)buffer;
+}
diff --git a/src/gallium/drivers/r600/evergreen_compute_internal.h b/src/gallium/drivers/r600/evergreen_compute_internal.h
new file mode 100644 (file)
index 0000000..340ff4b
--- /dev/null
@@ -0,0 +1,119 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Adam Rak <adam.rak@streamnovation.com>
+ */
+#ifndef EVERGREEN_COMPUTE_INTERNAL_H
+#define EVERGREEN_COMPUTE_INTERNAL_H
+
+#include "compute_memory_pool.h"
+
+/* Resource kinds, generated from compute_resource.def: every
+ * DECL_COMPUTE_RESOURCE(name, n) entry expands to the enumerator
+ * COMPUTE_RESOURCE_<name> (the second argument is not used by this
+ * expansion).  __COMPUTE_RESOURCE_END__ is the enumerator that follows all
+ * generated entries. */
+enum evergreen_compute_resources
+{
+#define DECL_COMPUTE_RESOURCE(name, n) COMPUTE_RESOURCE_ ## name ,
+#include "compute_resource.def"
+#undef DECL_COMPUTE_RESOURCE
+__COMPUTE_RESOURCE_END__
+};
+
+typedef unsigned u32;
+
+/* Values stored in evergreen_compute_resource::flags.
+ * COMPUTE_RES_CB_FLUSH(x) additionally carries x shifted left by 8; the
+ * argument is parenthesized so that expressions such as (a | b) are shifted
+ * as a whole. */
+#define COMPUTE_RES_TC_FLUSH      0xF0001
+#define COMPUTE_RES_VC_FLUSH      0xF0002
+#define COMPUTE_RES_SH_FLUSH      0xF0004
+#define COMPUTE_RES_CB_FLUSH(x)  (0xF0008 | ((x) << 8))
+#define COMPUTE_RES_FULL_FLUSH    0xF0010
+
+/* State record for one compute resource slot (see get_empty_res()). */
+struct evergreen_compute_resource {
+       int enabled;
+
+       /* do_reloc and cs are parallel 256-entry arrays; cs_end is presumably
+        * the number of cs entries in use -- TODO confirm against the emit
+        * helpers. */
+       int do_reloc[256];
+       u32 cs[256];
+       int cs_end;
+
+       struct r600_resource *bo;
+       int coher_bo_size;
+       enum radeon_bo_usage usage;
+       int flags; /* COMPUTE_RES_*_FLUSH value */
+};
+
+/* Sampler state handed to evergreen_set_sampler_resource(): an
+ * r600_pipe_state followed by the gallium sampler description. */
+struct compute_sampler_state {
+       struct r600_pipe_state base;
+       struct pipe_sampler_state state;
+};
+
+/* Output of evergreen_compute_get_gpu_format().
+ * NOTE(review): the exact meaning of each field is not visible in this
+ * header -- confirm against the implementation before relying on it. */
+struct number_type_and_format {
+       unsigned format;
+       unsigned number_type;
+       unsigned num_format_all;
+};
+
+/* Compute shader state object; r600_context::cs_shader points at one. */
+struct r600_pipe_compute {
+       struct r600_context *ctx;
+       struct r600_bytecode bc;
+       struct tgsi_token *tokens;
+
+       /* Resource table looked up through get_empty_res(). */
+       struct evergreen_compute_resource *resources;
+
+       unsigned local_size;
+       unsigned private_size;
+       unsigned input_size;
+#ifdef HAVE_OPENCL
+       /* Only present when built with OpenCL support. */
+       LLVMModuleRef mod;
+#endif
+       struct r600_resource *kernel_param;
+       struct r600_resource *shader_code_bo;
+};
+
+/* Get the hw format from a resource; returns 0 on failure, nonzero on
+ * success. */
+int evergreen_compute_get_gpu_format(struct number_type_and_format* fmt, struct r600_resource *bo);
+
+
+/* Low-level emit helpers.  The *_raw_* / reg_set variants take a resource
+ * record, the *_ctx_* variants take the context. */
+void evergreen_emit_raw_reg_set(struct evergreen_compute_resource* res, unsigned index, int num);
+void evergreen_emit_ctx_reg_set(struct r600_context *ctx, unsigned index, int num);
+void evergreen_emit_raw_value(struct evergreen_compute_resource* res, unsigned value);
+void evergreen_emit_ctx_value(struct r600_context *ctx, unsigned value);
+void evergreen_mult_reg_set_(struct evergreen_compute_resource* res,  int index, u32* array, int size);
+void evergreen_emit_ctx_reloc(struct r600_context *ctx, struct r600_resource *bo, enum radeon_bo_usage usage);
+void evergreen_reg_set(struct evergreen_compute_resource* res, unsigned index, unsigned value);
+void evergreen_emit_force_reloc(struct evergreen_compute_resource* res);
+
+void evergreen_set_buffer_sync(struct r600_context *ctx, struct r600_resource* bo, int size, int flags, enum radeon_bo_usage usage);
+
+struct evergreen_compute_resource* get_empty_res(struct r600_pipe_compute*, enum evergreen_compute_resources res_code, int index);
+int get_compute_resource_num(void);
+
+/* Convenience wrapper that passes sizeof(array) as the size argument, so
+ * `array` must be a real array and not a pointer. */
+#define evergreen_mult_reg_set(res, index, array) evergreen_mult_reg_set_(res, index, array, sizeof(array))
+
+/* Per-resource setters operating on a compute state object. */
+void evergreen_set_rat(struct r600_pipe_compute *pipe, int id, struct r600_resource* bo, int start, int size);
+void evergreen_set_lds(struct r600_pipe_compute *pipe, int num_lds, int size, int num_waves);
+void evergreen_set_gds(struct r600_pipe_compute *pipe, uint32_t addr, uint32_t size);
+void evergreen_set_export(struct r600_pipe_compute *pipe, struct r600_resource* bo, int offset, int size);
+void evergreen_set_loop_const(struct r600_pipe_compute *pipe, int id, int count, int init, int inc);
+void evergreen_set_tmp_ring(struct r600_pipe_compute *pipe, struct r600_resource* bo, int offset, int size, int se);
+void evergreen_set_vtx_resource(struct r600_pipe_compute *pipe, struct r600_resource* bo, int id, uint64_t offset, int writable);
+void evergreen_set_tex_resource(struct r600_pipe_compute *pipe, struct r600_pipe_sampler_view* view, int id);
+void evergreen_set_sampler_resource(struct r600_pipe_compute *pipe, struct compute_sampler_state *sampler, int id);
+void evergreen_set_const_cache(struct r600_pipe_compute *pipe, int cache_id, struct r600_resource* cbo, int size, int offset);
+
+/* Allocates a buffer of `size` bytes (size must be non-zero). */
+struct r600_resource* r600_compute_buffer_alloc_vram(struct r600_screen *screen, unsigned size);
+
+#endif
index ec0afe52adf49ad89b9930c1c66e9e1db614e09f..b618ca881ba0b012adc79a45d7e7e76d56588474 100644 (file)
@@ -28,6 +28,7 @@
 #include "util/u_memory.h"
 #include "util/u_framebuffer.h"
 #include "util/u_dual_blend.h"
+#include "evergreen_compute.h"
 
 static uint32_t eg_num_banks(uint32_t nbanks)
 {
@@ -1881,6 +1882,7 @@ void evergreen_init_state_functions(struct r600_context *rctx)
        rctx->context.create_stream_output_target = r600_create_so_target;
        rctx->context.stream_output_target_destroy = r600_so_target_destroy;
        rctx->context.set_stream_output_targets = r600_set_so_targets;
+       evergreen_init_compute_state_functions(rctx);
 }
 
 static void cayman_init_atom_start_cs(struct r600_context *rctx)
index 105d80f061d00ba6516d774e8ac574a4aa9ce6af..3b6d7304551911cea855f9a195edbd06750a93a2 100644 (file)
@@ -61,6 +61,8 @@
 #define R600_TEXEL_PITCH_ALIGNMENT_MASK        0x7
 
 #define PKT3_NOP                               0x10
+#define PKT3_DISPATCH_DIRECT                   0x15
+#define PKT3_DISPATCH_INDIRECT                 0x16
 #define PKT3_INDIRECT_BUFFER_END               0x17
 #define PKT3_SET_PREDICATION                   0x20
 #define PKT3_REG_RMW                           0x21
 #define PKT3_PREDICATE(x)               (((x) >> 0) & 0x1)
 #define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count))
 
+/* Bit OR'ed into a type-3 packet header by PKT3C to mark it as a compute
+ * packet. */
+#define RADEON_CP_PACKET3_COMPUTE_MODE 0x00000002
+
+/* Evergreen compute packet3 header: type 3, opcode `op`, count field
+ * `count`, predicate bit taken from bit 0 of `predicate`, plus the compute
+ * mode bit. */
+#define PKT3C(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate) | RADEON_CP_PACKET3_COMPUTE_MODE)
+
 /* Registers */
 #define R_0084FC_CP_STRMOUT_CNTL                    0x000084FC
 #define   S_0084FC_OFFSET_UPDATE_DONE(x)               (((x) & 0x1) << 0)
 #define   G_008CF0_ALU_UPDATE_FIFO_HIWATER(x)          (((x) >> 24) & 0x1F)
 #define   C_008CF0_ALU_UPDATE_FIFO_HIWATER(x)          0xE0FFFFFF
 
+#define R_008E20_SQ_STATIC_THREAD_MGMT1               0x8E20
+#define R_008E24_SQ_STATIC_THREAD_MGMT2               0x8E24
+#define R_008E28_SQ_STATIC_THREAD_MGMT3               0x8E28
+
+#define   R_00899C_VGT_COMPUTE_START_X                 0x0000899C
+#define   R_0089A0_VGT_COMPUTE_START_Y                 0x000089A0
+#define   R_0089A4_VGT_COMPUTE_START_Z                 0x000089A4
+#define   R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE       0x000089AC
+
 #define R_009100_SPI_CONFIG_CNTL                      0x00009100
 #define R_00913C_SPI_CONFIG_CNTL_1                    0x0000913C
 #define   S_00913C_VTX_DONE_DELAY(x)                (((x) & 0xF) << 0)
 #define   G_028410_ALPHA_TEST_BYPASS(x)                (((x) >> 8) & 0x1)
 #define   C_028410_ALPHA_TEST_BYPASS                   0xFFFFFEFF
 
+#define R_0286EC_SPI_COMPUTE_NUM_THREAD_X            0x0286EC
+#define R_0286F0_SPI_COMPUTE_NUM_THREAD_Y            0x0286F0
+#define R_0286F4_SPI_COMPUTE_NUM_THREAD_Z            0x0286F4
+#define R_028B74_VGT_DISPATCH_INITIATOR              0x028B74
+
 #define R_028800_DB_DEPTH_CONTROL                    0x028800
 #define   S_028800_STENCIL_ENABLE(x)                   (((x) & 0x1) << 0)
 #define   G_028800_STENCIL_ENABLE(x)                   (((x) >> 0) & 0x1)
 #define   S_028A40_CUT_MODE(x)                         (((x) & 0x3) << 3)
 #define   G_028A40_CUT_MODE(x)                         (((x) >> 3) & 0x3)
 #define   C_028A40_CUT_MODE                            0xFFFFFFE7
+/* Argument parenthesized so compound expressions are shifted as a whole. */
+#define   S_028A40_COMPUTE_MODE(x)                     ((x) << 14)
+#define   S_028A40_PARTIAL_THD_AT_EOI(x)               ((x) << 17)
 #define R_028A6C_VGT_GS_OUT_PRIM_TYPE                0x028A6C
 #define   S_028A6C_OUTPRIM_TYPE(x)                     (((x) & 0x3F) << 0)
 #define     V_028A6C_OUTPRIM_TYPE_POINTLIST            0
 #define   G_028848_ALLOW_DOUBLE_DENORM_OUT(x)          (((x) >> 7) & 0x1)
 #define   C_028848_ALLOW_DOUBLE_DENORM_OUT             0xFFFFFF7F
 
+/* SQ_PGM_RESOURCES_LS field accessors: S_ packs a field value into its bit
+ * position, G_ extracts it, C_ is the mask that clears it.  (This group was
+ * previously listed twice verbatim; one copy is sufficient.) */
+#define R_0288D4_SQ_PGM_RESOURCES_LS                 0x0288d4
+#define   S_0288D4_NUM_GPRS(x)                         (((x) & 0xFF) << 0)
+#define   G_0288D4_NUM_GPRS(x)                         (((x) >> 0) & 0xFF)
+#define   C_0288D4_NUM_GPRS                            0xFFFFFF00
+#define   S_0288D4_STACK_SIZE(x)                       (((x) & 0xFF) << 8)
+#define   G_0288D4_STACK_SIZE(x)                       (((x) >> 8) & 0xFF)
+#define   C_0288D4_STACK_SIZE                          0xFFFF00FF
+#define   S_0288D4_DX10_CLAMP(x)                       (((x) & 0x1) << 21)
+#define   G_0288D4_DX10_CLAMP(x)                       (((x) >> 21) & 0x1)
+#define   C_0288D4_DX10_CLAMP                          0xFFDFFFFF
+#define   S_0288D4_PRIME_CACHE_ON_DRAW(x)              (((x) & 0x1) << 23)
+#define   G_0288D4_PRIME_CACHE_ON_DRAW(x)              (((x) >> 23) & 0x1)
+#define   S_0288D4_UNCACHED_FIRST_INST(x)              (((x) & 0x1) << 28)
+#define   G_0288D4_UNCACHED_FIRST_INST(x)              (((x) >> 28) & 0x1)
+#define   C_0288D4_UNCACHED_FIRST_INST                 0xEFFFFFFF
+#define   S_0288D4_CLAMP_CONSTS(x)                     (((x) & 0x1) << 31)
+#define   G_0288D4_CLAMP_CONSTS(x)                     (((x) >> 31) & 0x1)
+#define   C_0288D4_CLAMP_CONSTS                        0x7FFFFFFF
+
+#define R_0288D8_SQ_PGM_RESOURCES_LS_2               0x0288d8
+
+
 #define R_028644_SPI_PS_INPUT_CNTL_0                 0x028644
 #define   S_028644_SEMANTIC(x)                         (((x) & 0xFF) << 0)
 #define   G_028644_SEMANTIC(x)                         (((x) >> 0) & 0xFF)
 #define R_0286DC_SPI_FOG_CNTL                        0x000286DC
 #define R_0286E4_SPI_PS_IN_CONTROL_2                 0x000286E4
 #define R_0286E8_SPI_COMPUTE_INPUT_CNTL              0x000286E8
+#define   S_0286E8_TID_IN_GROUP_ENA                  1
+#define   S_0286E8_TGID_ENA                          2
+#define   S_0286E8_DISABLE_INDEX_PACK                4
+#define R_028720_GDS_ADDR_BASE                       0x00028720
+#define R_028724_GDS_ADDR_SIZE                       0x00028724
+#define R_028728_GDS_ORDERED_WAVE_PER_SE             0x00028728
 #define R_028784_CB_BLEND1_CONTROL                   0x00028784
 #define R_028788_CB_BLEND2_CONTROL                   0x00028788
 #define R_02878C_CB_BLEND3_CONTROL                   0x0002878C
 #define   C_02884C_EXPORT_Z                            0xFFFFFFFE
 #define R_02885C_SQ_PGM_START_VS                     0x0002885C
 #define R_0288A4_SQ_PGM_START_FS                     0x000288A4
+#define R_0288D0_SQ_PGM_START_LS                     0x000288d0
 #define R_0288A8_SQ_PGM_RESOURCES_FS                 0x000288A8
 #define R_0288EC_SQ_LDS_ALLOC_PS                     0x000288EC
 #define R_028900_SQ_ESGS_RING_ITEMSIZE               0x00028900
diff --git a/src/gallium/drivers/r600/llvm_wrapper.cpp b/src/gallium/drivers/r600/llvm_wrapper.cpp
new file mode 100644 (file)
index 0000000..174fb01
--- /dev/null
@@ -0,0 +1,19 @@
+#include <llvm/ADT/OwningPtr.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/LLVMContext.h>
+#include <llvm/Support/IRReader.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Support/SourceMgr.h>
+
+#include "llvm_wrapper.h"
+
+
+/* Copy the given bitcode bytes into an LLVM memory buffer, run ParseIR on it
+ * in the global context and return the resulting module as a C handle.
+ * The diagnostic object filled by ParseIR is not inspected here. */
+extern "C" LLVMModuleRef llvm_parse_bitcode(const unsigned char * bitcode, unsigned bitcode_len)
+{
+       llvm::SMDiagnostic diag;
+       llvm::StringRef raw((const char*)bitcode, bitcode_len);
+       llvm::MemoryBuffer *copy = llvm::MemoryBuffer::getMemBufferCopy(raw);
+       llvm::OwningPtr<llvm::Module> module(
+               llvm::ParseIR(copy, diag, llvm::getGlobalContext()));
+
+       /* Release ownership so the caller receives the module. */
+       return wrap(module.take());
+}
diff --git a/src/gallium/drivers/r600/llvm_wrapper.h b/src/gallium/drivers/r600/llvm_wrapper.h
new file mode 100644 (file)
index 0000000..3a69645
--- /dev/null
@@ -0,0 +1,16 @@
+#ifndef LLVM_WRAPPER_H
+#define LLVM_WRAPPER_H
+
+#include <llvm-c/Core.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Parse `bitcode_len` bytes starting at `bitcode` and return the resulting
+ * LLVM module handle.  The input bytes are copied before parsing.
+ * NOTE(review): the result is presumably NULL when parsing fails -- confirm
+ * and check at call sites. */
+LLVMModuleRef llvm_parse_bitcode(const unsigned char * bitcode, unsigned bitcode_len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
index 0f6a1f8834140f8b2c7fc685061f47e14b8fed99..090d909a475e634598672de4dfe5db43923a1672 100644 (file)
@@ -2,7 +2,7 @@
 #ifndef R600_LLVM_H
 #define R600_LLVM_H
 
-#ifdef R600_USE_LLVM
+#if defined R600_USE_LLVM || defined HAVE_OPENCL
 
 #include "radeon_llvm.h"
 #include <llvm-c/Core.h>
@@ -24,6 +24,6 @@ unsigned r600_llvm_compile(
        enum radeon_family family,
        unsigned dump);
 
-#endif /* R600_USE_LLVM */
+#endif /* defined R600_USE_LLVM || defined HAVE_OPENCL */
 
 #endif /* R600_LLVM_H */
index cb13ca767c9a0d77390eb0184302a4c89069926c..e0ee823ce39cd06ecbcd0dad0fcc3a8f07478200 100644 (file)
@@ -382,6 +382,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
        case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
        case PIPE_CAP_USER_INDEX_BUFFERS:
        case PIPE_CAP_USER_CONSTANT_BUFFERS:
+       case PIPE_CAP_COMPUTE:
                return 1;
 
        case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
@@ -409,7 +410,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
        case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
        case PIPE_CAP_VERTEX_COLOR_CLAMPED:
        case PIPE_CAP_USER_VERTEX_BUFFERS:
-       case PIPE_CAP_COMPUTE:
                return 0;
 
        /* Stream output. */
@@ -491,6 +491,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
        {
        case PIPE_SHADER_FRAGMENT:
        case PIPE_SHADER_VERTEX:
+        case PIPE_SHADER_COMPUTE:
                break;
        case PIPE_SHADER_GEOMETRY:
                /* XXX: support and enable geometry programs */
@@ -538,8 +539,12 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
                return rscreen->glsl_feature_level >= 130;
        case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
                return 16;
-       case PIPE_SHADER_CAP_PREFERRED_IR:
-               return PIPE_SHADER_IR_TGSI;
+        case PIPE_SHADER_CAP_PREFERRED_IR:
+               if (shader == PIPE_SHADER_COMPUTE) {
+                       return PIPE_SHADER_IR_LLVM;
+               } else {
+                       return PIPE_SHADER_IR_TGSI;
+               }
        }
        return 0;
 }
@@ -569,6 +574,81 @@ static int r600_get_video_param(struct pipe_screen *screen,
        }
 }
 
+/* Write `count` 64-bit capability words to `ret` when the caller supplied a
+ * buffer; the byte size of the answer is returned either way. */
+static int r600_compute_cap_u64(void *ret, const uint64_t *values,
+        unsigned count)
+{
+       if (ret) {
+               uint64_t *out = ret;
+               unsigned i;
+
+               for (i = 0; i < count; i++) {
+                       out[i] = values[i];
+               }
+       }
+       return count * sizeof(uint64_t);
+}
+
+/* pipe_screen::get_compute_param hook.
+ *
+ * For each known capability the answer is stored into `ret` (if non-NULL)
+ * and its size in bytes is returned, so a caller can pass NULL first to
+ * learn the size.  Unknown capabilities log a message and return 0. */
+static int r600_get_compute_param(struct pipe_screen *screen,
+        enum pipe_compute_cap param,
+        void *ret)
+{
+       //TODO: select these params by asic
+       switch (param) {
+       case PIPE_COMPUTE_CAP_IR_TARGET: {
+               static const char ir_target[] = "r600--";
+
+               if (ret) {
+                       strcpy(ret, ir_target);
+               }
+               /* Six characters plus the terminating NUL. */
+               return sizeof(ir_target);
+       }
+
+       case PIPE_COMPUTE_CAP_GRID_DIMENSION: {
+               static const uint64_t grid_dimension[] = { 3 };
+
+               return r600_compute_cap_u64(ret, grid_dimension, 1);
+       }
+
+       case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: {
+               static const uint64_t grid_size[] = { 65535, 65535, 1 };
+
+               return r600_compute_cap_u64(ret, grid_size, 3);
+       }
+
+       case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: {
+               static const uint64_t block_size[] = { 256, 256, 256 };
+
+               return r600_compute_cap_u64(ret, block_size, 3);
+       }
+
+       case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: {
+               static const uint64_t max_threads_per_block[] = { 256 };
+
+               return r600_compute_cap_u64(ret, max_threads_per_block, 1);
+       }
+
+       case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: {
+               /* XXX: This is what the proprietary driver reports, we
+                * may want to use a different value. */
+               static const uint64_t max_global_size[] = { 201326592 };
+
+               return r600_compute_cap_u64(ret, max_global_size, 1);
+       }
+
+       case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: {
+               static const uint64_t max_input_size[] = { 1024 };
+
+               return r600_compute_cap_u64(ret, max_input_size, 1);
+       }
+
+       case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: {
+               /* XXX: This is what the proprietary driver reports, we
+                * may want to use a different value. */
+               static const uint64_t max_local_size[] = { 32768 };
+
+               return r600_compute_cap_u64(ret, max_local_size, 1);
+       }
+
+       default:
+               fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
+               return 0;
+       }
+}
+
 static void r600_destroy_screen(struct pipe_screen* pscreen)
 {
        struct r600_screen *rscreen = (struct r600_screen *)pscreen;
@@ -576,6 +656,10 @@ static void r600_destroy_screen(struct pipe_screen* pscreen)
        if (rscreen == NULL)
                return;
 
+       if (rscreen->global_pool) {
+               compute_memory_pool_delete(rscreen->global_pool);
+       }
+
        if (rscreen->fences.bo) {
                struct r600_fence_block *entry, *tmp;
 
@@ -833,6 +917,8 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
        rscreen->screen.get_shader_param = r600_get_shader_param;
        rscreen->screen.get_paramf = r600_get_paramf;
        rscreen->screen.get_video_param = r600_get_video_param;
+       rscreen->screen.get_compute_param = r600_get_compute_param;
+
        if (rscreen->chip_class >= EVERGREEN) {
                rscreen->screen.is_format_supported = evergreen_is_format_supported;
        } else {
@@ -857,5 +943,7 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
        rscreen->use_surface_alloc = debug_get_bool_option("R600_SURF", TRUE);
        rscreen->glsl_feature_level = debug_get_bool_option("R600_GLSL130", TRUE) ? 130 : 120;
 
+       rscreen->global_pool = compute_memory_pool_new(1024*16, rscreen);
+
        return &rscreen->screen;
 }
index e5ba49c5ac5ffacf7be90a24cab1b6fce5b439d2..f2865d2a22ef2f9d6d6f9707b7df6e958a562977 100644 (file)
 
 #include "util/u_slab.h"
 #include "r600.h"
+#include "r600_llvm.h"
+#include "r600_public.h"
 #include "r600_shader.h"
 #include "r600_resource.h"
+#include "evergreen_compute.h"
 
 #define R600_MAX_CONST_BUFFERS 2
 #define R600_MAX_CONST_BUFFER_SIZE 4096
@@ -98,9 +101,16 @@ enum r600_pipe_state_id {
        R600_PIPE_STATE_RESOURCE,
        R600_PIPE_STATE_POLYGON_OFFSET,
        R600_PIPE_STATE_FETCH_SHADER,
+       R600_PIPE_STATE_SPI,
        R600_PIPE_NSTATES
 };
 
+/* Forward declarations for the compute memory pool, so that r600_screen can
+ * hold a pool pointer without including the pool header here. */
+struct compute_memory_pool;
+void compute_memory_pool_delete(struct compute_memory_pool* pool);
+/* The initial size is given in dwords, as the parameter name states. */
+struct compute_memory_pool* compute_memory_pool_new(
+       int64_t initial_size_in_dw,
+       struct r600_screen *rscreen);
+
 struct r600_pipe_fences {
        struct r600_resource            *bo;
        unsigned                        *data;
@@ -123,6 +133,12 @@ struct r600_screen {
 
        bool                            use_surface_alloc;
        int                             glsl_feature_level;
+
+       /*for compute global memory binding, we allocate stuff here, instead of
+        * buffers.
+        * XXX: Not sure if this is the best place for global_pool.  Also,
+        * it's not thread safe, so it won't work with multiple contexts. */
+       struct compute_memory_pool *global_pool;
 };
 
 struct r600_pipe_sampler_view {
@@ -257,6 +273,7 @@ struct r600_context {
        struct pipe_clip_state          clip;
        struct r600_pipe_shader         *ps_shader;
        struct r600_pipe_shader         *vs_shader;
+       struct r600_pipe_compute        *cs_shader;
        struct r600_pipe_rasterizer     *rasterizer;
        struct r600_pipe_state          vgt;
        struct r600_pipe_state          spi;
@@ -266,7 +283,9 @@ struct r600_context {
        unsigned                        saved_render_cond_mode;
        /* shader information */
        boolean                         two_side;
+       boolean                         spi_dirty;
        unsigned                        sprite_coord_enable;
+       boolean                         flatshade;
        boolean                         export_16bpc;
        unsigned                        alpha_ref;
        boolean                         alpha_ref_dirty;
@@ -412,6 +431,10 @@ void r600_init_context_resource_functions(struct r600_context *r600);
 
 /* r600_shader.c */
 int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *shader);
+#ifdef HAVE_OPENCL
+int r600_compute_shader_create(struct pipe_context * ctx,
+       LLVMModuleRef mod,  struct r600_bytecode * bytecode);
+#endif
 void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader);
 int r600_find_vs_semantic_index(struct r600_shader *vs,
                                struct r600_shader *ps, int id);
index ef0b4ceffd0e17859a85d1ae1bef92bd9c420017..0c14a2dc6bc95adfe1e0c433b0aaac164baa0932 100644 (file)
@@ -27,7 +27,12 @@ static struct pipe_resource *r600_resource_create(struct pipe_screen *screen,
                                                const struct pipe_resource *templ)
 {
        if (templ->target == PIPE_BUFFER) {
-               return r600_buffer_create(screen, templ);
+               if (templ->bind & PIPE_BIND_GLOBAL) {
+                   return r600_compute_global_buffer_create(screen, templ);
+               }
+               else {
+                   return r600_buffer_create(screen, templ);
+               }
        } else {
                return r600_texture_create(screen, templ);
        }
@@ -44,12 +49,21 @@ static struct pipe_resource *r600_resource_from_handle(struct pipe_screen * scre
        }
 }
 
+/* Destroy hook for the screen: buffers created with PIPE_BIND_GLOBAL go to
+ * the compute global-buffer destructor, everything else takes the generic
+ * vtbl path. */
+void r600_resource_destroy(struct pipe_screen *screen, struct pipe_resource *res)
+{
+       const int is_global_buffer =
+               res->target == PIPE_BUFFER && (res->bind & PIPE_BIND_GLOBAL);
+
+       if (!is_global_buffer) {
+               u_resource_destroy_vtbl(screen, res);
+               return;
+       }
+
+       r600_compute_global_buffer_destroy(screen, res);
+}
+
 void r600_init_screen_resource_functions(struct pipe_screen *screen)
 {
        screen->resource_create = r600_resource_create;
        screen->resource_from_handle = r600_resource_from_handle;
        screen->resource_get_handle = u_resource_get_handle_vtbl;
-       screen->resource_destroy = u_resource_destroy_vtbl;
+       screen->resource_destroy = r600_resource_destroy;
 }
 
 void r600_init_context_resource_functions(struct r600_context *r600)
index 87bef7306548e2464c462271b279e800a647e45e..d401e40c5ba80b8e3f0b9ef577b7cc7a885e79d4 100644 (file)
@@ -34,6 +34,13 @@ struct r600_transfer {
        unsigned                        offset;
 };
 
+struct compute_memory_item;
+
+/* An r600_resource extended with a pointer to a compute memory item.
+ * NOTE(review): presumably the resource type behind PIPE_BIND_GLOBAL
+ * buffers, with `chunk` being its allocation in the global pool -- confirm
+ * against r600_compute_global_buffer_create(). */
+struct r600_resource_global {
+       struct r600_resource base;
+       struct compute_memory_item *chunk;
+};
+
 struct r600_resource_texture {
        struct r600_resource            resource;
 
@@ -65,6 +72,7 @@ struct r600_surface {
        unsigned                        aligned_height;
 };
 
+void r600_resource_destroy(struct pipe_screen *screen, struct pipe_resource *res);
 void r600_init_screen_resource_functions(struct pipe_screen *screen);
 
 /* r600_texture */
index dc208b923cb1b541fc1e1cee1bf9325972b02913..5f3c76eafbb34cd965322c6cd2c04aed5bf6b04a 100644 (file)
@@ -225,6 +225,37 @@ static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
  * struct r600_bytecode.
  */
 
+static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
+                               unsigned char * bytes,  unsigned num_bytes);
+
+#ifdef HAVE_OPENCL
+/* Compile an LLVM module into r600 bytecode for a compute shader.
+ *
+ * The module is compiled to a byte stream with r600_llvm_compile(), the
+ * caller-provided `bytecode` is initialised for the context's chip, marked
+ * as TGSI_PROCESSOR_COMPUTE, filled from the byte stream and built.  With
+ * R600_DUMP_SHADERS set the result is also dumped.  Always returns 1; the
+ * return values of the compile and build steps are not checked. */
+int r600_compute_shader_create(struct pipe_context * ctx,
+       LLVMModuleRef mod,  struct r600_bytecode * bytecode)
+{
+       struct r600_context *r600_ctx = (struct r600_context *)ctx;
+       struct r600_shader_ctx shader_ctx;
+       unsigned char * bytes;
+       unsigned byte_count;
+       const unsigned dump =
+               debug_get_bool_option("R600_DUMP_SHADERS", FALSE) ? 1 : 0;
+
+       r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
+
+       /* Only the bytecode pointer of the shader context is set up here. */
+       shader_ctx.bc = bytecode;
+       r600_bytecode_init(bytecode, r600_ctx->chip_class, r600_ctx->family);
+       bytecode->type = TGSI_PROCESSOR_COMPUTE;
+
+       r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
+       r600_bytecode_build(shader_ctx.bc);
+       if (dump) {
+               r600_bytecode_dump(shader_ctx.bc);
+       }
+       return 1;
+}
+
+#endif /* HAVE_OPENCL */
+
 static unsigned r600_src_from_byte_stream(unsigned char * bytes,
                unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
 {
index d6f85c38c321431280449564298ffa511b1ea965..5b159908adbfd2de7fe49b78e30e6d01870bd775 100644 (file)
@@ -916,6 +916,10 @@ void* r600_texture_transfer_map(struct pipe_context *ctx,
        unsigned offset = 0;
        char *map;
 
+       if ((transfer->resource->bind & PIPE_BIND_GLOBAL) && transfer->resource->target == PIPE_BUFFER) {
+               return r600_compute_global_transfer_map(ctx, transfer);
+       }
+
        if (rtransfer->staging) {
                buf = ((struct r600_resource *)rtransfer->staging)->cs_buf;
        } else {
@@ -945,6 +949,10 @@ void r600_texture_transfer_unmap(struct pipe_context *ctx,
        struct r600_context *rctx = (struct r600_context*)ctx;
        struct radeon_winsys_cs_handle *buf;
 
+       if ((transfer->resource->bind & PIPE_BIND_GLOBAL) && transfer->resource->target == PIPE_BUFFER) {
+               return r600_compute_global_transfer_unmap(ctx, transfer);
+       }
+
        if (rtransfer->staging) {
                buf = ((struct r600_resource *)rtransfer->staging)->cs_buf;
        } else {