/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */
#include <stdio.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "r600_hw_context_priv.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "llvm_wrapper.h"
/*
 * RAT0 is for global binding write
 * VTX1 is for global binding read
 *
 * for writing images: RAT1...
 * for reading images: TEX2...
 *   TEX2... consumes the same fetch resources that VTX2... would consume
 *
 * CONST0 and VTX0 are for parameters
 *   CONST0 binds the smaller input parameter buffer, and is for constant
 *   indexing
 *   VTX0 is for indirect/non-constant indexing, or if the input is bigger
 *   than the constant cache can handle
 *
 * RATs are limited to 12, so we can only bind at most 11 textures for
 * writing, because we reserve RAT0 for global bindings. With byteaddressing
 * enabled, we should reserve another one too => 10 image bindings for
 * writing max.
 *
 * For reference:
 *   CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
 *   CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
 *
 * So 10 for writing is enough. 176 is the max for reading according to the
 * docs.
 *
 * Writable images should be listed first (< 10), so their id corresponds to
 * RAT(id+1).
 * Writable images will consume TEX slots, and VTX slots too, because of
 * linear indexing.
 */
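
/* A minimal sketch (not part of the driver proper) of the fixed slot
 * arithmetic described above. The helper names are hypothetical, but the
 * offsets match what evergreen_set_rat() and evergreen_set_tex_resource()
 * are called with later in this file. */
static inline unsigned writable_image_to_rat(unsigned image_id)
{
	/* RAT0 is reserved for the global buffer pool, so writable image N
	 * lands in RAT(N + 1). */
	return image_id + 1;
}

static inline unsigned readable_image_to_fetch_slot(unsigned image_id)
{
	/* FETCH0 = VTX0 (parameter buffer) and FETCH1 = VTX1 (global buffer
	 * pool) are reserved, so TEX resources start at fetch slot 2. */
	return image_id + 2;
}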
static void evergreen_cs_set_vertex_buffer(
	struct r600_context * rctx,
	unsigned vb_index,
	unsigned offset,
	struct pipe_resource * buffer)
{
	struct pipe_vertex_buffer *vb = &rctx->cs_vertex_buffer[vb_index];
	struct r600_vertexbuf_state * state = &rctx->cs_vertex_buffer_state;

	vb->buffer_offset = offset;
	vb->buffer = buffer;
	vb->user_buffer = NULL;

	r600_inval_vertex_cache(rctx);
	state->dirty_mask |= 1 << vb_index;
	r600_atom_dirty(rctx, &state->atom);
}
const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_get_transfer, /* get_transfer */
	r600_compute_global_transfer_destroy, /* transfer_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
	r600_compute_global_transfer_inline_write /* transfer_inline_write */
};
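
/* For orientation (a comment, not driver code): u_resource_vtbl is how
 * util/u_transfer routes the generic pipe_context resource entry points to
 * the r600_compute_global_* callbacks defined later in this file; e.g. a
 * state tracker's transfer_map on a PIPE_BIND_GLOBAL buffer ends up in
 * r600_compute_global_transfer_map(). */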
void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	const struct pipe_llvm_program_header * header;
	const unsigned char * code;

	COMPUTE_DBG("*** evergreen_create_compute_state\n");

	header = (const struct pipe_llvm_program_header *)cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);

	if (!ctx->screen->screen.get_param(&ctx->screen->screen,
							PIPE_CAP_COMPUTE)) {
		fprintf(stderr, "Compute is not supported\n");
		return NULL;
	}

	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);

	shader->ctx = (struct r600_context *)ctx;
	shader->resources = (struct evergreen_compute_resource *)
		CALLOC(sizeof(struct evergreen_compute_resource),
		get_compute_resource_num());
	shader->local_size = cso->req_local_mem; ///TODO: assert it
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	shader->mod = llvm_parse_bitcode(code, header->num_bytes);

	r600_compute_shader_create(ctx_, shader->mod, &shader->bc);

	return shader;
}
void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
{
	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;

	free(shader->resources);
	free(shader);
}
static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG("*** evergreen_bind_compute_state\n");

	ctx->cs_shader = (struct r600_pipe_compute *)state;

	if (!ctx->cs_shader->shader_code_bo) {
		ctx->cs_shader->shader_code_bo =
			r600_compute_buffer_alloc_vram(ctx->screen,
					ctx->cs_shader->bc.ndw * 4);

		void *p = ctx->ws->buffer_map(
				ctx->cs_shader->shader_code_bo->cs_buf,
				ctx->cs, PIPE_TRANSFER_WRITE);

		memcpy(p, ctx->cs_shader->bc.bytecode, ctx->cs_shader->bc.ndw * 4);

		ctx->ws->buffer_unmap(ctx->cs_shader->shader_code_bo->cs_buf);
	}

	struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader,
					COMPUTE_RESOURCE_SHADER, 0);

	if (ctx->chip_class < CAYMAN) {
		evergreen_reg_set(res, R_008C0C_SQ_GPR_RESOURCE_MGMT_3,
			S_008C0C_NUM_LS_GPRS(ctx->cs_shader->bc.ngpr));
	}

	///maybe we can use it later
	evergreen_reg_set(res, R_0286C8_SPI_THREAD_GROUPING, 0);
	///maybe we can use it later
	evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0);

	evergreen_reg_set(res, R_0288D4_SQ_PGM_RESOURCES_LS,
		S_0288D4_NUM_GPRS(ctx->cs_shader->bc.ngpr)
		| S_0288D4_STACK_SIZE(ctx->cs_shader->bc.nstack));
	evergreen_reg_set(res, R_0288D8_SQ_PGM_RESOURCES_LS_2, 0);

	evergreen_reg_set(res, R_0288D0_SQ_PGM_START_LS, 0);
	res->bo = ctx->cs_shader->shader_code_bo;
	res->usage = RADEON_USAGE_READ;
	res->coher_bo_size = ctx->cs_shader->bc.ndw * 4;

	r600_inval_shader_cache(ctx);
}
/* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well. Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	int i;
	/* Byte offset of the explicit kernel parameters: 9 implicit DWORDS
	 * (see the layout comment above) * 4 bytes. */
	unsigned kernel_parameters_offset_bytes = 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;

	if (ctx->cs_shader->input_size == 0) {
		return;
	}

	if (!ctx->cs_shader->kernel_param) {
		unsigned buffer_size = ctx->cs_shader->input_size;

		/* Add space for the grid dimensions; the offset is already
		 * in bytes, so no further scaling is needed. */
		buffer_size += kernel_parameters_offset_bytes;
		ctx->cs_shader->kernel_param =
			r600_compute_buffer_alloc_vram(ctx->screen,
					buffer_size);
	}

	num_work_groups_start = ctx->ws->buffer_map(
			ctx->cs_shader->kernel_param->cs_buf,
			ctx->cs, PIPE_TRANSFER_WRITE);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the number of work groups */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, ctx->cs_shader->input_size);

	for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
			(ctx->cs_shader->input_size / 4); i++) {
		COMPUTE_DBG("input %i : %i\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->ws->buffer_unmap(ctx->cs_shader->kernel_param->cs_buf);

	///ID=0 is reserved for the parameters
	evergreen_cs_set_vertex_buffer(ctx, 0, 0,
			(struct pipe_resource *)ctx->cs_shader->kernel_param);
	///ID=0 is reserved for the parameters
	evergreen_set_const_cache(ctx->cs_shader, 0,
		ctx->cs_shader->kernel_param, ctx->cs_shader->input_size, 0);
}
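
/* For illustration only: the implicit-parameter layout uploaded above,
 * written out as a struct. This is not a type the driver uses; it simply
 * mirrors the 36-byte kernel_parameters_offset_bytes and the DWORD layout
 * documented before evergreen_compute_upload_input(). */
struct compute_input_header {
	uint32_t num_work_groups[3]; /* DWORDS 0-2 */
	uint32_t global_size[3];     /* DWORDS 3-5 */
	uint32_t local_size[3];      /* DWORDS 6-8 */
	/* DWORDS 9+: explicit kernel parameters follow */
};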
void evergreen_direct_dispatch(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout)
{
	/* This struct r600_context* must be called rctx, because the
	 * r600_pipe_state_add_reg macro assumes there is a local variable
	 * of type struct r600_context* called rctx.
	 */
	struct r600_context *rctx = (struct r600_context *)ctx_;
	int i;

	struct evergreen_compute_resource* res = get_empty_res(rctx->cs_shader,
					COMPUTE_RESOURCE_DISPATCH, 0);

	/* Set CB_TARGET_MASK */
	evergreen_reg_set(res, R_028238_CB_TARGET_MASK, rctx->compute_cb_target_mask);

	evergreen_reg_set(res, R_008958_VGT_PRIMITIVE_TYPE, V_008958_DI_PT_POINTLIST);

	evergreen_reg_set(res, R_00899C_VGT_COMPUTE_START_X, 0);
	evergreen_reg_set(res, R_0089A0_VGT_COMPUTE_START_Y, 0);
	evergreen_reg_set(res, R_0089A4_VGT_COMPUTE_START_Z, 0);

	evergreen_reg_set(res, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, block_layout[0]);
	evergreen_reg_set(res, R_0286F0_SPI_COMPUTE_NUM_THREAD_Y, block_layout[1]);
	evergreen_reg_set(res, R_0286F4_SPI_COMPUTE_NUM_THREAD_Z, block_layout[2]);

	int group_size = 1;
	int grid_size = 1;

	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= grid_layout[i];
	}

	evergreen_reg_set(res, R_008970_VGT_NUM_INDICES, group_size);
	evergreen_reg_set(res, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size);

	evergreen_emit_raw_value(res, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	evergreen_emit_raw_value(res, grid_layout[0]);
	evergreen_emit_raw_value(res, grid_layout[1]);
	evergreen_emit_raw_value(res, grid_layout[2]);
	///VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN
	evergreen_emit_raw_value(res, 1);
}
static void compute_emit_cs(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	int i, j;

	struct r600_resource *onebo = NULL;
	struct r600_pipe_state *cb_state;

	/* Initialize all the registers common to both 3D and compute. Some
	 * 3D-only registers will be initialized by this atom as well, but
	 * this is OK for now.
	 *
	 * See evergreen_init_atom_start_cs() or cayman_init_atom_start_cs() in
	 * evergreen_state.c for the list of registers that are initialized by
	 * the start_cs_cmd atom.
	 */
	r600_emit_atom(ctx, &ctx->start_cs_cmd.atom);

	/* Initialize all the compute specific registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_atom(ctx, &ctx->start_compute_cs_cmd.atom);

	/* Emit the framebuffer state */
	cb_state = ctx->states[R600_PIPE_STATE_FRAMEBUFFER];
	r600_context_pipe_state_emit(ctx, cb_state, RADEON_CP_PACKET3_COMPUTE_MODE);

	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * ctx->nr_cs_vertex_buffers;
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	for (i = 0; i < get_compute_resource_num(); i++) {
		if (ctx->cs_shader->resources[i].enabled) {
			COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);

			for (j = 0; j < ctx->cs_shader->resources[i].cs_end; j++) {
				if (ctx->cs_shader->resources[i].do_reloc[j]) {
					assert(ctx->cs_shader->resources[i].bo);
					evergreen_emit_ctx_reloc(ctx,
						ctx->cs_shader->resources[i].bo,
						ctx->cs_shader->resources[i].usage);
				}

				cs->buf[cs->cdw++] = ctx->cs_shader->resources[i].cs[j];
			}

			if (ctx->cs_shader->resources[i].bo) {
				onebo = ctx->cs_shader->resources[i].bo;
				evergreen_emit_ctx_reloc(ctx,
					ctx->cs_shader->resources[i].bo,
					ctx->cs_shader->resources[i].usage);

				///special case for textures
				if (ctx->cs_shader->resources[i].do_reloc
					[ctx->cs_shader->resources[i].cs_end] == 2) {
					evergreen_emit_ctx_reloc(ctx,
						ctx->cs_shader->resources[i].bo,
						ctx->cs_shader->resources[i].usage);
				}
			}
		}
	}

	/* r600_flush_framebuffer() updates the cb_flush_flags and then
	 * calls r600_emit_atom() on the ctx->surface_sync_cmd.atom, which emits
	 * a SURFACE_SYNC packet via r600_emit_surface_sync().
	 *
	 * XXX r600_emit_surface_sync() hardcodes the CP_COHER_SIZE to
	 * 0xffffffff, so we will need to add a field to struct
	 * r600_surface_sync_cmd if we want to manually set this value.
	 */
	r600_flush_framebuffer(ctx, true /* Flush now */);

	COMPUTE_DBG("cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
	}

	ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE);

	ctx->pm4_dirty_cdwords = 0;

	COMPUTE_DBG("shader started\n");

	ctx->ws->buffer_wait(onebo->buf, 0);

	COMPUTE_DBG("...\n");

	ctx->streamout_start = TRUE;
	ctx->streamout_append_bitmask = ~0;
}
static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	COMPUTE_DBG("PC: %i\n", pc);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	unsigned num_waves;
	unsigned num_pipes = ctx->screen->info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
			num_pipes, num_waves);

	evergreen_set_lds(ctx->cs_shader, 0, 0, num_waves);
	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	evergreen_direct_dispatch(ctx_, block_layout, grid_layout);
	compute_emit_cs(ctx);
}
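
/* Worked example for the num_waves computation above (illustrative numbers,
 * not taken from any particular chip): with block_layout = {16, 16, 1} and
 * num_pipes = 8, wave_divisor = 128 and
 * num_waves = (256 + 127) / 128 = 2 wavefronts per thread block. */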
static void evergreen_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG("*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (int i = 0; i < count; i++) {
		/* The first two vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 2 + i;

		struct r600_resource_global *buffer =
			(struct r600_resource_global *)
			resources[i]->base.texture;

		if (resources[i]->base.writable) {
			evergreen_set_rat(ctx->cs_shader, i+1,
				(struct r600_resource *)resources[i]->base.texture,
				buffer->chunk->start_in_dw * 4,
				resources[i]->base.texture->width0);
		}

		evergreen_cs_set_vertex_buffer(ctx, vtx_id,
				buffer->chunk->start_in_dw * 4,
				resources[i]->base.texture);
		ctx->nr_cs_vertex_buffers = vtx_id + 1;
	}
}
static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
		unsigned start_slot, unsigned count,
		struct pipe_sampler_view **views)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_sampler_view **resource =
		(struct r600_pipe_sampler_view **)views;

	for (int i = 0; i < count; i++) {
		///FETCH0 = VTX0 (param buffer),
		///FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
		evergreen_set_tex_resource(ctx->cs_shader, resource[i], i+2);
	}
}
static void evergreen_bind_compute_sampler_states(
	struct pipe_context *ctx_,
	unsigned start_slot,
	unsigned num_samplers,
	void **samplers_)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_sampler_state ** samplers =
		(struct compute_sampler_state **)samplers_;

	for (int i = 0; i < num_samplers; i++) {
		evergreen_set_sampler_resource(ctx->cs_shader, samplers[i], i);
	}
}
static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;

	COMPUTE_DBG("*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	compute_memory_finalize_pending(pool, ctx_);

	for (int i = 0; i < n; i++) {
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		*(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
	}

	evergreen_set_rat(ctx->cs_shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
				(struct pipe_resource *)pool->bo);
}
/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream. Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function. The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in
 * the functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs,
 * depending on the chip class.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
{
	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* We aren't passing the EMIT_EARLY flag as the third argument
	 * because we will be emitting this atom manually in order to
	 * ensure it gets emitted after the start_cs_cmd atom.
	 */
	r600_init_command_buffer(cb, 256, 0);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	switch (ctx->family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	if (ctx->chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage. It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0. */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}

	/* Context Registers */

	if (ctx->chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
					S_0286E8_TID_IN_GROUP_ENA
					| S_0286E8_DISABLE_INDEX_PACK);

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code. We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops. However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed. This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}
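
/* Illustrative decoding of the 0x1000FFF value above (assuming the usual
 * R600-family SQ_LOOP_CONST packing of count/init/increment bitfields):
 * the low 12 bits hold the maximum iteration count (0xFFF = 4095), the
 * middle field holds the initial counter (0), and the high field holds the
 * increment (1), matching the comment preceding eg_store_loop_const(). */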
void evergreen_init_compute_state_functions(struct r600_context *ctx)
{
	ctx->context.create_compute_state = evergreen_create_compute_state;
	ctx->context.delete_compute_state = evergreen_delete_compute_state;
	ctx->context.bind_compute_state = evergreen_bind_compute_state;
//	ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	ctx->context.set_compute_resources = evergreen_set_compute_resources;
	ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
	ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
	ctx->context.set_global_binding = evergreen_set_global_binding;
	ctx->context.launch_grid = evergreen_launch_grid;

	/* We always use at least two vertex buffers for compute, one for
	 * parameters and one for global memory */
	ctx->nr_cs_vertex_buffers = 2;
}
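
/* Illustrative call sequence (state-tracker side, simplified and with
 * hypothetical variable names) through the hooks wired up above:
 *
 *   void *cs = pipe->create_compute_state(pipe, &state);
 *   pipe->bind_compute_state(pipe, cs);
 *   pipe->set_global_binding(pipe, 0, 1, &res, &handle);
 *   pipe->launch_grid(pipe, block, grid, pc, input);
 *   pipe->delete_compute_state(pipe, cs);
 */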
struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	struct r600_resource_global* result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	struct r600_screen* rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG("*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG("width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	/* Copy the template first, so the screen pointer set below is not
	 * overwritten by the copy. */
	result->base.b.b = *templ;
	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

	int size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL) {
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}
void r600_compute_global_buffer_destroy(
	struct pipe_screen *screen,
	struct pipe_resource *res)
{
	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	struct r600_resource_global* buffer = (struct r600_resource_global*)res;
	struct r600_screen* rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}
void* r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
	assert(transfer->box.x >= 0);
	assert(transfer->box.y == 0);
	assert(transfer->box.z == 0);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)transfer->resource;
	uint32_t* map;

	///TODO: do it better, mapping is not possible if the pool is too big

	if (!(map = ctx->ws->buffer_map(buffer->chunk->pool->bo->cs_buf,
						ctx->cs, transfer->usage))) {
		return NULL;
	}

	COMPUTE_DBG("buffer start: %lli\n", buffer->chunk->start_in_dw);
	return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
}
void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)transfer->resource;

	ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
}
struct pipe_transfer * r600_compute_global_get_transfer(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;

	compute_memory_finalize_pending(pool, ctx_);

	assert(resource->target == PIPE_BUFFER);

	struct pipe_transfer *transfer = util_slab_alloc(&ctx->pool_transfers);

	transfer->resource = resource;
	transfer->level = level;
	transfer->usage = usage;
	transfer->box = *box;
	transfer->stride = 0;
	transfer->layer_stride = 0;
	transfer->data = NULL;

	/* Note strides are zero; this is OK for buffers, but not for
	 * textures 2D & higher, at least.
	 */
	return transfer;
}
void r600_compute_global_transfer_destroy(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer)
{
	struct r600_context *rctx = (struct r600_context *)ctx_;
	util_slab_free(&rctx->pool_transfers, transfer);
}
void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	assert(0 && "TODO");
}
void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	assert(0 && "TODO");
}