/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */
#include <stdio.h>

#include <gelf.h>
#include <libelf.h>

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
/*
 * RAT0 is for global binding write
 * VTX1 is for global binding read
 *
 * for writing images RAT1...
 * for reading images TEX2...
 *   TEX2... consumes the same fetch resources that VTX2... would consume
 *
 * CONST0 and VTX0 are for parameters
 *   CONST0 binds the smaller input parameter buffer for constant indexing,
 *   and is also constant cached
 *   VTX0 is for indirect/non-constant indexing, or if the input is bigger
 *   than the constant cache can handle
 *
 * RATs are limited to 12, so we can only bind at most 11 textures for
 * writing, because we reserve RAT0 for global bindings. With byte
 * addressing enabled, we should reserve another one too => 10 image
 * bindings for writing max.
 *
 * from Nvidia OpenCL:
 *   CL_DEVICE_MAX_READ_IMAGE_ARGS:  128
 *   CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
 *
 * so 10 for writing is enough. 176 is the max for reading according to
 * the docs.
 *
 * writable images should be listed first (id < 10), so their id
 * corresponds to RAT(id+1)
 * writable images will consume TEX slots, and VTX slots too, because of
 * linear indexing
 */
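
/* Illustrative example (editorial, not part of the original scheme text):
 * with two writable images and one read-only image bound, writable ids 0
 * and 1 map to RAT1 and RAT2, RAT0 stays reserved for global writes, and
 * all three images also consume TEX fetch slots starting at TEX2, while
 * CONST0/VTX0 (parameters) and VTX1 (global reads) remain untouched. */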
struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
						     unsigned size)
{
	struct pipe_resource *buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create((struct pipe_screen *)screen,
				    0, PIPE_USAGE_IMMUTABLE, size);

	return (struct r600_resource *)buffer;
}
static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers */
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop. cb_target_mask may be modified in the 3D sections of
	 * this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface *)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}
static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
					   unsigned vb_index,
					   unsigned offset,
					   struct pipe_resource *buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->buffer_offset = offset;
	vb->buffer.resource = buffer;
	vb->is_user_buffer = false;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	r600_mark_atom_dirty(rctx, &state->atom);
}
static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
					     unsigned cb_index,
					     unsigned offset,
					     unsigned size,
					     struct pipe_resource *buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
}
/* We need to define these R600 registers here, because we can't include
 * evergreend.h and r600d.h.
 */
#define R_028868_SQ_PGM_RESOURCES_VS     0x028868
#define R_028850_SQ_PGM_RESOURCES_PS     0x028850
static void parse_symbol_table(Elf_Data *symbol_table_data,
			       const GElf_Shdr *symbol_table_header,
			       struct ac_shader_binary *binary)
{
	GElf_Sym symbol;
	unsigned i = 0;
	unsigned symbol_count =
		symbol_table_header->sh_size / symbol_table_header->sh_entsize;

	/* We are over allocating this list, because symbol_count gives the
	 * total number of symbols, and we will only be filling the list
	 * with offsets of global symbols. The memory savings from
	 * allocating the correct size of this list will be small, and
	 * I don't think it is worth the cost of pre-computing the number
	 * of global symbols.
	 */
	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));

	while (gelf_getsym(symbol_table_data, i++, &symbol)) {
		unsigned i;
		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
		    symbol.st_shndx == 0 /* Undefined symbol */) {
			continue;
		}

		binary->global_symbol_offsets[binary->global_symbol_count] =
			symbol.st_value;

		/* Sort the list using bubble sort. This list will usually
		 * be small. */
		for (i = binary->global_symbol_count; i > 0; --i) {
			uint64_t lhs = binary->global_symbol_offsets[i - 1];
			uint64_t rhs = binary->global_symbol_offsets[i];
			if (lhs < rhs) {
				break;
			}
			binary->global_symbol_offsets[i] = lhs;
			binary->global_symbol_offsets[i - 1] = rhs;
		}
		++binary->global_symbol_count;
	}
}
static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
			 unsigned symbol_sh_link,
			 struct ac_shader_binary *binary)
{
	unsigned i;

	if (!relocs || !symbols || !binary->reloc_count) {
		return;
	}
	binary->relocs = CALLOC(binary->reloc_count,
				sizeof(struct ac_shader_reloc));
	for (i = 0; i < binary->reloc_count; i++) {
		GElf_Sym symbol;
		GElf_Rel rel;
		char *symbol_name;
		struct ac_shader_reloc *reloc = &binary->relocs[i];

		gelf_getrel(relocs, i, &rel);
		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
		symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);

		reloc->offset = rel.r_offset;
		strncpy(reloc->name, symbol_name, sizeof(reloc->name) - 1);
		reloc->name[sizeof(reloc->name) - 1] = 0;
	}
}
static void r600_elf_read(const char *elf_data, unsigned elf_size,
			  struct ac_shader_binary *binary)
{
	char *elf_buffer;
	Elf *elf;
	Elf_Scn *section = NULL;
	Elf_Data *symbols = NULL, *relocs = NULL;
	size_t section_str_index;
	unsigned symbol_sh_link = 0;

	/* One of the libelf implementations
	 * (http://www.mr511.de/software/english.htm) requires calling
	 * elf_version() before elf_memory().
	 */
	elf_version(EV_CURRENT);
	elf_buffer = MALLOC(elf_size);
	memcpy(elf_buffer, elf_data, elf_size);

	elf = elf_memory(elf_buffer, elf_size);

	elf_getshdrstrndx(elf, &section_str_index);

	while ((section = elf_nextscn(elf, section))) {
		const char *name;
		Elf_Data *section_data = NULL;
		GElf_Shdr section_header;
		if (gelf_getshdr(section, &section_header) != &section_header) {
			fprintf(stderr, "Failed to read ELF section header\n");
			return;
		}
		name = elf_strptr(elf, section_str_index, section_header.sh_name);
		if (!strcmp(name, ".text")) {
			section_data = elf_getdata(section, section_data);
			binary->code_size = section_data->d_size;
			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
			memcpy(binary->code, section_data->d_buf, binary->code_size);
		} else if (!strcmp(name, ".AMDGPU.config")) {
			section_data = elf_getdata(section, section_data);
			binary->config_size = section_data->d_size;
			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
			memcpy(binary->config, section_data->d_buf, binary->config_size);
		} else if (!strcmp(name, ".AMDGPU.disasm")) {
			/* Always read disassembly if it's available. */
			section_data = elf_getdata(section, section_data);
			binary->disasm_string = strndup(section_data->d_buf,
							section_data->d_size);
		} else if (!strncmp(name, ".rodata", 7)) {
			section_data = elf_getdata(section, section_data);
			binary->rodata_size = section_data->d_size;
			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
		} else if (!strncmp(name, ".symtab", 7)) {
			symbols = elf_getdata(section, section_data);
			symbol_sh_link = section_header.sh_link;
			parse_symbol_table(symbols, &section_header, binary);
		} else if (!strcmp(name, ".rel.text")) {
			relocs = elf_getdata(section, section_data);
			binary->reloc_count = section_header.sh_size /
					      section_header.sh_entsize;
		}
	}

	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);

	if (elf) {
		elf_end(elf);
	}
	FREE(elf_buffer);

	/* Cache the config size per symbol */
	if (binary->global_symbol_count) {
		binary->config_size_per_symbol =
			binary->config_size / binary->global_symbol_count;
	} else {
		binary->global_symbol_count = 1;
		binary->config_size_per_symbol = binary->config_size;
	}
}
static const unsigned char *r600_shader_binary_config_start(
	const struct ac_shader_binary *binary,
	uint64_t symbol_offset)
{
	unsigned i;
	for (i = 0; i < binary->global_symbol_count; ++i) {
		if (binary->global_symbol_offsets[i] == symbol_offset) {
			unsigned offset = i * binary->config_size_per_symbol;
			return binary->config + offset;
		}
	}
	return binary->config;
}
static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
					   struct r600_bytecode *bc,
					   uint64_t symbol_offset,
					   boolean *use_kill)
{
	unsigned i;
	const unsigned char *config =
		r600_shader_binary_config_start(binary, symbol_offset);

	for (i = 0; i < binary->config_size_per_symbol; i += 8) {
		unsigned reg =
			util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value =
			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		/* R600 / R700 */
		case R_028850_SQ_PGM_RESOURCES_PS:
		case R_028868_SQ_PGM_RESOURCES_VS:
		/* Evergreen / Northern Islands */
		case R_028844_SQ_PGM_RESOURCES_PS:
		case R_028860_SQ_PGM_RESOURCES_VS:
		case R_0288D4_SQ_PGM_RESOURCES_LS:
			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
			break;
		case R_02880C_DB_SHADER_CONTROL:
			*use_kill = G_02880C_KILL_ENABLE(value);
			break;
		case R_0288E8_SQ_LDS_ALLOC:
			bc->nlds_dw = value;
			break;
		}
	}
}
static unsigned r600_create_shader(struct r600_bytecode *bc,
				   const struct ac_shader_binary *binary,
				   boolean *use_kill)
{
	assert(binary->code_size % 4 == 0);
	bc->bytecode = CALLOC(1, binary->code_size);
	memcpy(bc->bytecode, binary->code, binary->code_size);
	bc->ndw = binary->code_size / 4;

	r600_shader_binary_read_config(binary, bc, 0, use_kill);
	return 0;
}
static void r600_destroy_shader(struct r600_bytecode *bc)
{
	FREE(bc->bytecode);
}
static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
	const struct pipe_llvm_program_header *header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
	radeon_shader_binary_init(&shader->binary);
	r600_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							 shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(shader->code_bo->buf);

	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}
static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	radeon_shader_binary_clean(&shader->binary);
	r600_destroy_shader(&shader->bc);

	/* TODO destroy shader->code_bo, shader->const_bo
	 * we'll need something like r600_buffer_free */
	FREE(shader);
}
static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;

	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");

	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}
/* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
 * explicit kernel parameters, there are implicit parameters that need to
 * be stored in the vertex buffer as well. Here is how these parameters
 * are organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each
 *             dimension (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
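
/* Worked example (editorial): for a grid of (4,2,1) work groups with
 * (16,16,1) work items each and 8 bytes of kernel arguments, the buffer
 * holds:
 *   DWORDS 0-2: 4, 2, 1    (work groups)
 *   DWORDS 3-5: 64, 32, 1  (global work items: grid * block per dimension)
 *   DWORDS 6-8: 16, 16, 1  (work items per group)
 *   DWORDS 9-10: the two kernel argument dwords
 */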
static void evergreen_compute_upload_input(struct pipe_context *ctx,
					   const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size = shader->input_size + 36;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx->screen, 0,
					   PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx->transfer_map(ctx,
			(struct pipe_resource *)shader->kernel_param,
			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
			&box, &transfer);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
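	/* Editorial note: sizeof(uint) is 4 here, so each expression above
	 * advances the uint32_t pointer by exactly 3 dwords, matching the
	 * three 3-dword sections described in the layout comment before
	 * this function. */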
	/* Copy the work group size */
	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = info->grid[i] * info->block[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, info->block, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, info->input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->transfer_unmap(ctx, transfer);

	/* ID=0 and ID=3 are reserved for the parameters.
	 * LLVM will preferably use ID=0, but it does not work for dynamic
	 * indices. */
	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
			(struct pipe_resource *)shader->kernel_param);
	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
			(struct pipe_resource *)shader->kernel_param);
}
static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4 +
			shader->bc.nlds_dw;
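
	/* Editorial note: local_size is in bytes while SQ_LDS_ALLOC counts
	 * dwords, hence the division by 4; e.g. 2048 bytes of local memory
	 * become 512 dwords, on top of the nlds_dw the shader config
	 * already requested. */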
	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= info->grid[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
		     wave_divisor - 1) / wave_divisor;
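	/* Worked example (editorial): for block = (16,16,1) on a chip with
	 * num_pipes = 8, wave_divisor = 128 and
	 * num_waves = (256 + 127) / 128 = 2 wavefronts per thread block. */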
	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
			      group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
				       lds_size | (num_waves << 14));
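	/* Editorial note: the dword count lives in the low bits and the wave
	 * count starts at bit 14, so the two fields cannot overlap given the
	 * lds_size asserts above (<= 8192). */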
	/* Dispatch packet */
	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	radeon_emit(cs, info->grid[0]);
	radeon_emit(cs, info->grid[1]);
	radeon_emit(cs, info->grid[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	radeon_emit(cs, 1);
}
static void compute_emit_cs(struct r600_context *rctx,
			    const struct pipe_grid_info *info)
{
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	unsigned i;

	/* make sure that the gfx ring is the only one active */
	if (radeon_emitted(rctx->b.dma.cs, 0)) {
		rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);

	/* emit config state */
	if (rctx->b.chip_class == EVERGREEN)
		r600_emit_atom(rctx, &rctx->config_state.atom);

	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(rctx);

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface *)rctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						(struct r600_resource *)cb->base.texture,
						RADEON_USAGE_READWRITE,
						RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);   /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);  /* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);  /* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);   /* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);   /* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);    /* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	for (; i < 8; i++)
		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
	for (; i < 12; i++)
		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
				       rctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit compute shader state */
	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_dispatch(rctx, info);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(rctx);
	rctx->b.flags = 0;

	if (rctx->b.chip_class >= CAYMAN) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
		radeon_emit(cs, 0);
	}

#if 0
	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif
}
/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
		(struct r600_cs_shader_state *)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	code_bo = shader->code_bo;
	va = shader->code_bo->gpu_address + state->pc;
	ngpr = shader->bc.ngpr;
	nstack = shader->bc.nstack;

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr) |
			S_0288D4_DX10_CLAMP(1) |
			S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						  code_bo, RADEON_USAGE_READ,
						  RADEON_PRIO_SHADER_BINARY));
}
static void evergreen_launch_grid(struct pipe_context *ctx,
				  const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	boolean use_kill;

	rctx->cs_shader_state.pc = info->pc;
	/* Get the config information for this kernel. */
	r600_shader_binary_read_config(&shader->binary, &shader->bc,
				       info->pc, &use_kill);

	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);

	evergreen_compute_upload_input(ctx, info);
	compute_emit_cs(rctx, info);
}
static void evergreen_set_compute_resources(struct pipe_context *ctx,
					    unsigned start, unsigned count,
					    struct pipe_surface **surfaces)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first four vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 4 + i;
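		/* Editorial cross-reference: per the binding scheme at the
		 * top of this file, VTX0 carries parameters, VTX1 global
		 * reads, VTX2 the text-segment constants and VTX3 the
		 * kernel-param buffer, so resources[0] lands in VTX4,
		 * resources[1] in VTX5, and so on. */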
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global *)
				resources[i]->base.texture;

			if (resources[i]->base.writable) {
				assert(i + 1 < 12);

				evergreen_set_rat(rctx->cs_shader_state.shader, i + 1,
				(struct r600_resource *)resources[i]->base.texture,
				buffer->chunk->start_in_dw * 4,
				resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}
static void evergreen_set_global_binding(struct pipe_context *ctx,
					 unsigned first, unsigned n,
					 struct pipe_resource **resources,
					 uint32_t **handles)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx) == -1) {
		/* XXX: Unset */
		return;
	}

	for (i = first; i < first + n; i++)
	{
		uint32_t buffer_offset;
		uint32_t handle;

		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}
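	/* Worked example (editorial): a handle that initially holds byte
	 * offset 16, for a buffer whose chunk starts at dword 100 of the
	 * pool, is patched to 16 + 100 * 4 = 416, i.e. a byte address
	 * relative to the start of the pool. */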
	/* globals for writing */
	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	/* globals for reading */
	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
				(struct pipe_resource *)pool->bo);

	/* constants for reading, LLVM puts them in text segment */
	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
				(struct pipe_resource *)rctx->cs_shader_state.shader->code_bo);
}
/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream. Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function. The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs,
 * depending on the chip family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
{
	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* This must be first. */
	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	r600_store_value(cb, 0x80000000);
	r600_store_value(cb, 0x80000000);

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (rctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}
	/* Config Registers */
	if (rctx->b.chip_class < CAYMAN)
		evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
					   rctx->screen->b.info.drm_minor);
	else
		cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
					rctx->screen->b.info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
			      V_008958_DI_PT_POINTLIST);

	if (rctx->b.chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage. It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0. */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate. When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (rctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
				      S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
				       S_0286FC_NUM_PS_LDS(0) |
				       S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}
	/* Context Registers */

	if (rctx->b.chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				       S_028838_PS_GPRS(0x1e) |
				       S_028838_VS_GPRS(0x1e) |
				       S_028838_GS_GPRS(0x1e) |
				       S_028838_ES_GPRS(0x1e) |
				       S_028838_HS_GPRS(0x1e) |
				       S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
			       S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
			       S_0286E8_TID_IN_GROUP_ENA(1) |
			       S_0286E8_TGID_ENA(1) |
			       S_0286E8_DISABLE_INDEX_PACK(1));
	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code. We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops. However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed. This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
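	/* Editorial sketch of the encoding, assuming the usual r600
	 * SQ_LOOP_CONST field layout (count in bits [11:0], initial value
	 * in [23:12], increment in [31:24]): 0x1000FFF = (1 << 24) | 0xFFF,
	 * i.e. increment 1, initial value 0, maximum count 4095, matching
	 * the comment above. */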
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}
void evergreen_init_compute_state_functions(struct r600_context *rctx)
{
	rctx->b.b.create_compute_state = evergreen_create_compute_state;
	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	rctx->b.b.set_global_binding = evergreen_set_global_binding;
	rctx->b.b.launch_grid = evergreen_launch_grid;
}
static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
					      struct pipe_resource *resource,
					      unsigned level,
					      unsigned usage,
					      const struct pipe_box *box,
					      struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global *buffer =
		(struct r600_resource_global *)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx);
	} else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
				r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource *)item->real_buffer;

	if (usage & PIPE_TRANSFER_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
			"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	///TODO: do it better, mapping is not possible if the pool is too big
	return pipe_buffer_map_range(ctx, dst,
				     offset, box->width, usage, ptransfer);
}
static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
					       struct pipe_transfer *transfer)
{
	/* struct r600_resource_global are not real resources, they just map
	 * to an offset within the compute memory pool. The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called it uses the memory pool's
	 * vtable, which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert (!"This function should not be called");
}
static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
						      struct pipe_transfer *transfer,
						      const struct pipe_box *box)
{
	assert(0 && "TODO");
}
static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
					       struct pipe_resource *res)
{
	struct r600_resource_global *buffer = NULL;
	struct r600_screen *rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global *)res;
	rscreen = (struct r600_screen *)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}
static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle,             /* get_handle */
	r600_compute_global_buffer_destroy,        /* resource_destroy */
	r600_compute_global_transfer_map,          /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap,        /* transfer_unmap */
};
struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
							const struct pipe_resource *templ)
{
	struct r600_resource_global *result = NULL;
	struct r600_screen *rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global *)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen *)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0 + 3) / 4;
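	/* Editorial note: the pool allocator works in dwords, so the byte
	 * width is rounded up; e.g. width0 = 10 gives (10 + 3) / 4 = 3
	 * dwords. */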
	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL)
	{
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}
;