#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
-#include "util/u_double_list.h"
+#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
-#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#ifdef HAVE_OPENCL
-#include "radeon_llvm_util.h"
+#include "radeon/radeon_llvm_util.h"
#endif
+#include "radeon/radeon_elf_util.h"
+#include <inttypes.h>
/**
 * RAT0 is for global binding write
 */
static void evergreen_set_rat(
struct r600_pipe_compute *pipe,
- int id,
+ unsigned id,
struct r600_resource* bo,
int start,
int size)
{
struct r600_context *ctx = (struct r600_context *)ctx_;
struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
-
#ifdef HAVE_OPENCL
const struct pipe_llvm_program_header * header;
- const unsigned char * code;
- unsigned i;
-
- shader->llvm_ctx = LLVMContextCreate();
+ const char *code;
+ void *p;
+ boolean use_kill;
COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
-
header = cso->prog;
code = cso->prog + sizeof(struct pipe_llvm_program_header);
+#if HAVE_LLVM < 0x0306
+ (void)use_kill;
+ (void)p;
+ shader->llvm_ctx = LLVMContextCreate();
+ shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx,
+ code, header->num_bytes);
+ shader->kernels = CALLOC(sizeof(struct r600_kernel),
+ shader->num_kernels);
+ {
+ unsigned i;
+ for (i = 0; i < shader->num_kernels; i++) {
+ struct r600_kernel *kernel = &shader->kernels[i];
+ kernel->llvm_module = radeon_llvm_get_kernel_module(
+ shader->llvm_ctx, i, code, header->num_bytes);
+ }
+ }
+#else
+ memset(&shader->binary, 0, sizeof(shader->binary));
+ radeon_elf_read(code, header->num_bytes, &shader->binary);
+ r600_create_shader(&shader->bc, &shader->binary, &use_kill);
+
+ shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
+ shader->bc.ndw * 4);
+ p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
+ memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
+ ctx->b.ws->buffer_unmap(shader->code_bo->cs_buf);
+#endif
#endif
shader->ctx = (struct r600_context*)ctx;
shader->private_size = cso->req_private_mem;
shader->input_size = cso->req_input_mem;
-#ifdef HAVE_OPENCL
- shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx, code,
- header->num_bytes);
- shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);
-
- for (i = 0; i < shader->num_kernels; i++) {
- struct r600_kernel *kernel = &shader->kernels[i];
- kernel->llvm_module = radeon_llvm_get_kernel_module(shader->llvm_ctx, i,
- code, header->num_bytes);
- }
-#endif
return shader;
}
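
The HAVE_LLVM checks above gate the two compile paths on the LLVM version, which Mesa's build encodes as a two-byte hex value, (major << 8) | minor, so 0x0306 is LLVM 3.6. A minimal sketch of the split:

	#if HAVE_LLVM < 0x0306
		/* pre-3.6: one LLVM module per kernel, compiled lazily at
		 * launch_grid time and cached in kernel->code_bo */
	#else
		/* 3.6+: the state tracker hands over a finished ELF binary;
		 * parse it once and upload the bytecode to code_bo up front */
	#endif
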
if (!shader)
return;
- FREE(shader->kernels);
-
-#ifdef HAVE_OPENCL
- if (shader->llvm_ctx){
- LLVMContextDispose(shader->llvm_ctx);
- }
-#endif
-
FREE(shader);
}
{
struct r600_context *ctx = (struct r600_context *)ctx_;
struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
- int i;
+ unsigned i;
/* We need to reserve 9 dwords (36 bytes) for implicit kernel
* parameters.
*/
memcpy(kernel_parameters_start, input, shader->input_size);
for (i = 0; i < (input_size / 4); i++) {
- COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
+		COMPUTE_DBG(ctx->screen, "input %u : %u\n", i,
((unsigned*)num_work_groups_start)[i]);
}
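
The nine reserved dwords are the implicit parameters written ahead of the user arguments. A sketch of one plausible layout, assuming the first three dwords are the work-group counts (num_work_groups_start is visible above; the names for the other two pointers are not in this excerpt and are hypothetical):

	/* bytes 0-35 of the input buffer (9 dwords), sketch:
	 *   dwords 0-2: num_work_groups  = grid_layout[i]
	 *   dwords 3-5: global size      = grid_layout[i] * block_layout[i]
	 *   dwords 6-8: local size       = block_layout[i]
	 * kernel_parameters_start then points 9 dwords past the base. */
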
unsigned wave_divisor = (16 * num_pipes);
int group_size = 1;
int grid_size = 1;
- unsigned lds_size = shader->local_size / 4 + shader->active_kernel->bc.nlds_dw;
+ unsigned lds_size = shader->local_size / 4 +
+#if HAVE_LLVM < 0x0306
+ shader->active_kernel->bc.nlds_dw;
+#else
+ shader->bc.nlds_dw;
+#endif
+
/* Calculate group_size/grid_size */
for (i = 0; i < 3; i++) {
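
group_size and grid_size end up as the products of the three block and grid dimensions, and the wave count is the group size rounded up to a whole multiple of wave_divisor. A worked example, assuming an 8x8x1 block on a part with num_pipes = 2:

	group_size   = 8 * 8 * 1;            /* 64 threads per group */
	wave_divisor = 16 * 2;               /* = 32                 */
	num_waves    = (64 + 32 - 1) / 32;   /* = 2, rounded up      */
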
const uint *grid_layout)
{
struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
- int i;
+ unsigned i;
	/* make sure that the gfx ring is the only one active */
- if (ctx->b.rings.dma.cs) {
- ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
+ if (ctx->b.rings.dma.cs && ctx->b.rings.dma.cs->cdw) {
+ ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
}
/* Initialize all the compute-related registers.
struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
unsigned reloc = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx,
(struct r600_resource*)cb->base.texture,
- RADEON_USAGE_READWRITE);
+ RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SHADER_RESOURCE_RW);
r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
ctx->b.flags = 0;
if (ctx->b.chip_class >= CAYMAN) {
- ctx->skip_surface_sync_on_next_cs_flush = true;
+ cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+ cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
+ /* DEALLOC_STATE prevents the GPU from hanging when a
+ * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
+ * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
+ */
+ cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
+ cs->buf[cs->cdw++] = 0;
}
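
The raw cs->buf[cs->cdw++] stores above are exactly what the radeon_emit() helper used elsewhere in this file expands to, so the same flush sequence could equally be written as:

	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
	radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
	radeon_emit(cs, 0);
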
#if 0
struct r600_cs_shader_state *state =
(struct r600_cs_shader_state*)atom;
struct r600_pipe_compute *shader = state->shader;
- struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
uint64_t va;
+ struct r600_resource *code_bo;
+ unsigned ngpr, nstack;
- va = r600_resource_va(&rctx->screen->b.b, &kernel->code_bo->b.b);
+#if HAVE_LLVM < 0x0306
+ struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
+ code_bo = kernel->code_bo;
+ va = kernel->code_bo->gpu_address;
+ ngpr = kernel->bc.ngpr;
+ nstack = kernel->bc.nstack;
+#else
+ code_bo = shader->code_bo;
+ va = shader->code_bo->gpu_address + state->pc;
+ ngpr = shader->bc.ngpr;
+ nstack = shader->bc.nstack;
+#endif
r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
- S_0288D4_NUM_GPRS(kernel->bc.ngpr)
- | S_0288D4_STACK_SIZE(kernel->bc.nstack));
+ S_0288D4_NUM_GPRS(ngpr)
+ | S_0288D4_STACK_SIZE(nstack));
radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
- kernel->code_bo, RADEON_USAGE_READ));
+ code_bo, RADEON_USAGE_READ,
+ RADEON_PRIO_SHADER_DATA));
}
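
Note the va >> 8 above: SQ_PGM_START_LS holds the shader address in 256-byte units, so code_bo allocations must be 256-byte aligned. A quick illustration:

	uint64_t va  = 0x1000;       /* 4 KiB, 256-byte aligned              */
	uint32_t reg = va >> 8;      /* 0x10 written to SQ_PGM_START_LS;     */
	                             /* any low 8 bits of va would be lost   */
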
static void evergreen_launch_grid(
uint32_t pc, const void *input)
{
struct r600_context *ctx = (struct r600_context *)ctx_;
-
+#ifdef HAVE_OPENCL
struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
+ boolean use_kill;
+
+#if HAVE_LLVM < 0x0306
struct r600_kernel *kernel = &shader->kernels[pc];
+ (void)use_kill;
+ if (!kernel->code_bo) {
+ void *p;
+ struct r600_bytecode *bc = &kernel->bc;
+ LLVMModuleRef mod = kernel->llvm_module;
+ boolean use_kill = false;
+ bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
+ unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
+ unsigned sb_disasm = use_sb ||
+ (ctx->screen->b.debug_flags & DBG_SB_DISASM);
+
+ r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
+ ctx->screen->has_compressed_msaa_texturing);
+ bc->type = TGSI_PROCESSOR_COMPUTE;
+ bc->isa = ctx->isa;
+ r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
+
+ if (dump && !sb_disasm) {
+ r600_bytecode_disasm(bc);
+ } else if ((dump && sb_disasm) || use_sb) {
+ if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
+ R600_ERR("r600_sb_bytecode_process failed!\n");
+ }
+
+ kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
+ kernel->bc.ndw * 4);
+ p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
+ memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
+ ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
+ }
+ shader->active_kernel = kernel;
+ ctx->cs_shader_state.kernel_index = pc;
+#else
+ ctx->cs_shader_state.pc = pc;
+ /* Get the config information for this kernel. */
+ r600_shader_binary_read_config(&shader->binary, &shader->bc, pc, &use_kill);
+#endif
+#endif
COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
-#ifdef HAVE_OPENCL
- if (!kernel->code_bo) {
- void *p;
- struct r600_bytecode *bc = &kernel->bc;
- LLVMModuleRef mod = kernel->llvm_module;
- boolean use_kill = false;
- bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
- unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
- unsigned sb_disasm = use_sb ||
- (ctx->screen->b.debug_flags & DBG_SB_DISASM);
-
- r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
- ctx->screen->has_compressed_msaa_texturing);
- bc->type = TGSI_PROCESSOR_COMPUTE;
- bc->isa = ctx->isa;
- r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
-
- if (dump && !sb_disasm) {
- r600_bytecode_disasm(bc);
- } else if ((dump && sb_disasm) || use_sb) {
- if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
- R600_ERR("r600_sb_bytecode_process failed!\n");
- }
-
- kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
- kernel->bc.ndw * 4);
- p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
- memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
- ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
- }
-#endif
- shader->active_kernel = kernel;
- ctx->cs_shader_state.kernel_index = pc;
evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
compute_emit_cs(ctx, block_layout, grid_layout);
}
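
Note that the meaning of pc changes between the two paths: below LLVM 3.6 it indexes shader->kernels[], while on 3.6+ it is a byte offset into the shared code_bo (see va = shader->code_bo->gpu_address + state->pc above). A hedged sketch of a caller, assuming the launch_grid signature of this Gallium era:

	uint block_layout[3] = { 8, 8, 1 };   /* threads per group */
	uint grid_layout[3]  = { 4, 4, 1 };   /* groups per grid   */

	pipe->launch_grid(pipe, block_layout, grid_layout,
			  0 /* pc */, input /* input_size bytes of arguments */);
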
COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
start, count);
- for (int i = 0; i < count; i++) {
+ for (unsigned i = 0; i < count; i++) {
		/* The first two vertex buffers are reserved for parameters and
* global buffers. */
unsigned vtx_id = 2 + i;
struct r600_pipe_sampler_view **resource =
(struct r600_pipe_sampler_view **)views;
- for (int i = 0; i < count; i++) {
+ for (unsigned i = 0; i < count; i++) {
if (resource[i]) {
assert(i+1 < 12);
/* XXX: Implement */
struct compute_memory_pool *pool = ctx->screen->global_pool;
struct r600_resource_global **buffers =
(struct r600_resource_global **)resources;
+ unsigned i;
COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
first, n);
return;
}
- compute_memory_finalize_pending(pool, ctx_);
+ /* We mark these items for promotion to the pool if they
+ * aren't already there */
+	for (i = first; i < first + n; i++) {
+		struct compute_memory_item *item = buffers[i]->chunk;
+		if (!is_item_in_pool(item))
+			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
+	}
+
+	if (compute_memory_finalize_pending(pool, ctx_) == -1) {
+		/* XXX: Unset */
+		return;
+	}
+
-	for (int i = 0; i < n; i++)
+	for (i = first; i < first + n; i++)
{
+ uint32_t buffer_offset;
+ uint32_t handle;
assert(resources[i]->target == PIPE_BUFFER);
assert(resources[i]->bind & PIPE_BIND_GLOBAL);
- *(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
+ buffer_offset = util_le32_to_cpu(*(handles[i]));
+ handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
+
+ *(handles[i]) = util_cpu_to_le32(handle);
}
evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
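
The value handed back through handles[i] is a byte offset into the pool RAT that evergreen_set_rat() binds as RAT0, kept little-endian because the kernel consumes it untranslated. A worked example: if the state tracker stored a relative offset of 16 and the chunk landed at dword 100 of the pool,

	handle = 16 + 100 * 4;                 /* = 416 bytes into the RAT */
	*(handles[i]) = util_cpu_to_le32(416); /* stored little-endian     */
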
ctx->b.b.set_global_binding = evergreen_set_global_binding;
ctx->b.b.launch_grid = evergreen_launch_grid;
- /* We always use at least one vertex buffer for parameters (id = 1)*/
- ctx->cs_vertex_buffer_state.enabled_mask =
- ctx->cs_vertex_buffer_state.dirty_mask = 0x2;
}
struct pipe_resource *r600_compute_global_buffer_create(
struct r600_resource_global* buffer =
(struct r600_resource_global*)resource;
+ struct compute_memory_item *item = buffer->chunk;
+ struct pipe_resource *dst = NULL;
+ unsigned offset = box->x;
+
+ if (is_item_in_pool(item)) {
+ compute_memory_demote_item(pool, item, ctx_);
+ }
+ else {
+ if (item->real_buffer == NULL) {
+ item->real_buffer = (struct r600_resource*)
+ r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
+ }
+ }
+
+ dst = (struct pipe_resource*)item->real_buffer;
+
+ if (usage & PIPE_TRANSFER_READ)
+ buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
+
COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
"width = %u, height = %u, depth = %u)\n", level, usage,
box->x, box->y, box->z, box->width, box->height,
box->depth);
- COMPUTE_DBG(rctx->screen, "Buffer id = %u offset = "
- "%u (box.x)\n", buffer->chunk->id, box->x);
-
+ COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
+ "%u (box.x)\n", item->id, box->x);
- compute_memory_finalize_pending(pool, ctx_);
assert(resource->target == PIPE_BUFFER);
assert(resource->bind & PIPE_BIND_GLOBAL);
assert(box->z == 0);
///TODO: do it better, mapping is not possible if the pool is too big
- return pipe_buffer_map_range(ctx_, (struct pipe_resource*)buffer->chunk->pool->bo,
- box->x + (buffer->chunk->start_in_dw * 4),
- box->width, usage, ptransfer);
+ return pipe_buffer_map_range(ctx_, dst,
+ offset, box->width, usage, ptransfer);
}
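
Since the mapping now always targets the item's own real_buffer (demoted out of the pool when needed) rather than the shared pool bo, the plain box->x offset suffices. A sketch of a read-back through the standard transfer helpers, assuming a pipe_resource *res created by r600_compute_global_buffer_create:

	struct pipe_transfer *xfer;
	uint32_t *map = pipe_buffer_map_range(ctx_, res, 0, size,
					      PIPE_TRANSFER_READ, &xfer);
	if (map) {
		/* ... consume size bytes of results ... */
		pipe_buffer_unmap(ctx_, xfer);
	}
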
void r600_compute_global_transfer_unmap(