2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
24 * Adam Rak <adam.rak@streamnovation.com>
31 #include "pipe/p_defines.h"
32 #include "pipe/p_state.h"
33 #include "pipe/p_context.h"
34 #include "util/u_blitter.h"
35 #include "util/list.h"
36 #include "util/u_transfer.h"
37 #include "util/u_surface.h"
38 #include "util/u_pack_color.h"
39 #include "util/u_memory.h"
40 #include "util/u_inlines.h"
41 #include "util/u_framebuffer.h"
42 #include "pipebuffer/pb_buffer.h"
43 #include "evergreend.h"
44 #include "r600_shader.h"
45 #include "r600_pipe.h"
46 #include "r600_formats.h"
47 #include "evergreen_compute.h"
48 #include "evergreen_compute_internal.h"
49 #include "compute_memory_pool.h"
50 #include "sb/sb_public.h"
54 RAT0 is for global binding write
55 VTX1 is for global binding read
57 for writing images RAT1...
58 for reading images TEX2...
61 TEX2... consumes the same fetch resources, that VTX2... would consume
63 CONST0 and VTX0 is for parameters
64 CONST0 is binding smaller input parameter buffer, and for constant indexing,
66 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
67 the constant cache can handle
69 RAT-s are limited to 12, so we can only bind at most 11 textures for writing
70 because we reserve RAT0 for global bindings. With byteaddressing enabled,
71 we should reserve another one too. => 10 image bindings for writing max.
74 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
75 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
77 so 10 for writing is enough. 176 is the max for reading according to the docs
79 writable images should be listed first < 10, so their id corresponds to RAT(id+1)
80 writable images will consume TEX slots, VTX slots too because of linear indexing
84 struct r600_resource
*r600_compute_buffer_alloc_vram(struct r600_screen
*screen
,
87 struct pipe_resource
*buffer
= NULL
;
90 buffer
= pipe_buffer_create((struct pipe_screen
*) screen
,
91 0, PIPE_USAGE_IMMUTABLE
, size
);
93 return (struct r600_resource
*)buffer
;
97 static void evergreen_set_rat(struct r600_pipe_compute
*pipe
,
99 struct r600_resource
*bo
,
103 struct pipe_surface rat_templ
;
104 struct r600_surface
*surf
= NULL
;
105 struct r600_context
*rctx
= NULL
;
108 assert((size
& 3) == 0);
109 assert((start
& 0xFF) == 0);
113 COMPUTE_DBG(rctx
->screen
, "bind rat: %i \n", id
);
115 /* Create the RAT surface */
116 memset(&rat_templ
, 0, sizeof(rat_templ
));
117 rat_templ
.format
= PIPE_FORMAT_R32_UINT
;
118 rat_templ
.u
.tex
.level
= 0;
119 rat_templ
.u
.tex
.first_layer
= 0;
120 rat_templ
.u
.tex
.last_layer
= 0;
122 /* Add the RAT the list of color buffers */
123 pipe
->ctx
->framebuffer
.state
.cbufs
[id
] = pipe
->ctx
->b
.b
.create_surface(
124 (struct pipe_context
*)pipe
->ctx
,
125 (struct pipe_resource
*)bo
, &rat_templ
);
127 /* Update the number of color buffers */
128 pipe
->ctx
->framebuffer
.state
.nr_cbufs
=
129 MAX2(id
+ 1, pipe
->ctx
->framebuffer
.state
.nr_cbufs
);
131 /* Update the cb_target_mask
132 * XXX: I think this is a potential spot for bugs once we start doing
133 * GL interop. cb_target_mask may be modified in the 3D sections
135 pipe
->ctx
->compute_cb_target_mask
|= (0xf << (id
* 4));
137 surf
= (struct r600_surface
*)pipe
->ctx
->framebuffer
.state
.cbufs
[id
];
138 evergreen_init_color_surface_rat(rctx
, surf
);
141 static void evergreen_cs_set_vertex_buffer(struct r600_context
*rctx
,
144 struct pipe_resource
*buffer
)
146 struct r600_vertexbuf_state
*state
= &rctx
->cs_vertex_buffer_state
;
147 struct pipe_vertex_buffer
*vb
= &state
->vb
[vb_index
];
149 vb
->buffer_offset
= offset
;
150 vb
->buffer
.resource
= buffer
;
151 vb
->is_user_buffer
= false;
153 /* The vertex instructions in the compute shaders use the texture cache,
154 * so we need to invalidate it. */
155 rctx
->b
.flags
|= R600_CONTEXT_INV_VERTEX_CACHE
;
156 state
->enabled_mask
|= 1 << vb_index
;
157 state
->dirty_mask
|= 1 << vb_index
;
158 r600_mark_atom_dirty(rctx
, &state
->atom
);
161 static void evergreen_cs_set_constant_buffer(struct r600_context
*rctx
,
165 struct pipe_resource
*buffer
)
167 struct pipe_constant_buffer cb
;
168 cb
.buffer_size
= size
;
169 cb
.buffer_offset
= offset
;
171 cb
.user_buffer
= NULL
;
173 rctx
->b
.b
.set_constant_buffer(&rctx
->b
.b
, PIPE_SHADER_COMPUTE
, cb_index
, &cb
);
176 /* We need to define these R600 registers here, because we can't include
177 * evergreend.h and r600d.h.
179 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
180 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
183 static void parse_symbol_table(Elf_Data
*symbol_table_data
,
184 const GElf_Shdr
*symbol_table_header
,
185 struct ac_shader_binary
*binary
)
189 unsigned symbol_count
=
190 symbol_table_header
->sh_size
/ symbol_table_header
->sh_entsize
;
192 /* We are over allocating this list, because symbol_count gives the
193 * total number of symbols, and we will only be filling the list
194 * with offsets of global symbols. The memory savings from
195 * allocating the correct size of this list will be small, and
196 * I don't think it is worth the cost of pre-computing the number
199 binary
->global_symbol_offsets
= CALLOC(symbol_count
, sizeof(uint64_t));
201 while (gelf_getsym(symbol_table_data
, i
++, &symbol
)) {
203 if (GELF_ST_BIND(symbol
.st_info
) != STB_GLOBAL
||
204 symbol
.st_shndx
== 0 /* Undefined symbol */) {
208 binary
->global_symbol_offsets
[binary
->global_symbol_count
] =
211 /* Sort the list using bubble sort. This list will usually
213 for (i
= binary
->global_symbol_count
; i
> 0; --i
) {
214 uint64_t lhs
= binary
->global_symbol_offsets
[i
- 1];
215 uint64_t rhs
= binary
->global_symbol_offsets
[i
];
219 binary
->global_symbol_offsets
[i
] = lhs
;
220 binary
->global_symbol_offsets
[i
- 1] = rhs
;
222 ++binary
->global_symbol_count
;
227 static void parse_relocs(Elf
*elf
, Elf_Data
*relocs
, Elf_Data
*symbols
,
228 unsigned symbol_sh_link
,
229 struct ac_shader_binary
*binary
)
233 if (!relocs
|| !symbols
|| !binary
->reloc_count
) {
236 binary
->relocs
= CALLOC(binary
->reloc_count
,
237 sizeof(struct ac_shader_reloc
));
238 for (i
= 0; i
< binary
->reloc_count
; i
++) {
242 struct ac_shader_reloc
*reloc
= &binary
->relocs
[i
];
244 gelf_getrel(relocs
, i
, &rel
);
245 gelf_getsym(symbols
, GELF_R_SYM(rel
.r_info
), &symbol
);
246 symbol_name
= elf_strptr(elf
, symbol_sh_link
, symbol
.st_name
);
248 reloc
->offset
= rel
.r_offset
;
249 strncpy(reloc
->name
, symbol_name
, sizeof(reloc
->name
)-1);
250 reloc
->name
[sizeof(reloc
->name
)-1] = 0;
254 static void r600_elf_read(const char *elf_data
, unsigned elf_size
,
255 struct ac_shader_binary
*binary
)
259 Elf_Scn
*section
= NULL
;
260 Elf_Data
*symbols
= NULL
, *relocs
= NULL
;
261 size_t section_str_index
;
262 unsigned symbol_sh_link
= 0;
264 /* One of the libelf implementations
265 * (http://www.mr511.de/software/english.htm) requires calling
266 * elf_version() before elf_memory().
268 elf_version(EV_CURRENT
);
269 elf_buffer
= MALLOC(elf_size
);
270 memcpy(elf_buffer
, elf_data
, elf_size
);
272 elf
= elf_memory(elf_buffer
, elf_size
);
274 elf_getshdrstrndx(elf
, §ion_str_index
);
276 while ((section
= elf_nextscn(elf
, section
))) {
278 Elf_Data
*section_data
= NULL
;
279 GElf_Shdr section_header
;
280 if (gelf_getshdr(section
, §ion_header
) != §ion_header
) {
281 fprintf(stderr
, "Failed to read ELF section header\n");
284 name
= elf_strptr(elf
, section_str_index
, section_header
.sh_name
);
285 if (!strcmp(name
, ".text")) {
286 section_data
= elf_getdata(section
, section_data
);
287 binary
->code_size
= section_data
->d_size
;
288 binary
->code
= MALLOC(binary
->code_size
* sizeof(unsigned char));
289 memcpy(binary
->code
, section_data
->d_buf
, binary
->code_size
);
290 } else if (!strcmp(name
, ".AMDGPU.config")) {
291 section_data
= elf_getdata(section
, section_data
);
292 binary
->config_size
= section_data
->d_size
;
293 binary
->config
= MALLOC(binary
->config_size
* sizeof(unsigned char));
294 memcpy(binary
->config
, section_data
->d_buf
, binary
->config_size
);
295 } else if (!strcmp(name
, ".AMDGPU.disasm")) {
296 /* Always read disassembly if it's available. */
297 section_data
= elf_getdata(section
, section_data
);
298 binary
->disasm_string
= strndup(section_data
->d_buf
,
299 section_data
->d_size
);
300 } else if (!strncmp(name
, ".rodata", 7)) {
301 section_data
= elf_getdata(section
, section_data
);
302 binary
->rodata_size
= section_data
->d_size
;
303 binary
->rodata
= MALLOC(binary
->rodata_size
* sizeof(unsigned char));
304 memcpy(binary
->rodata
, section_data
->d_buf
, binary
->rodata_size
);
305 } else if (!strncmp(name
, ".symtab", 7)) {
306 symbols
= elf_getdata(section
, section_data
);
307 symbol_sh_link
= section_header
.sh_link
;
308 parse_symbol_table(symbols
, §ion_header
, binary
);
309 } else if (!strcmp(name
, ".rel.text")) {
310 relocs
= elf_getdata(section
, section_data
);
311 binary
->reloc_count
= section_header
.sh_size
/
312 section_header
.sh_entsize
;
316 parse_relocs(elf
, relocs
, symbols
, symbol_sh_link
, binary
);
323 /* Cache the config size per symbol */
324 if (binary
->global_symbol_count
) {
325 binary
->config_size_per_symbol
=
326 binary
->config_size
/ binary
->global_symbol_count
;
328 binary
->global_symbol_count
= 1;
329 binary
->config_size_per_symbol
= binary
->config_size
;
333 static const unsigned char *r600_shader_binary_config_start(
334 const struct ac_shader_binary
*binary
,
335 uint64_t symbol_offset
)
338 for (i
= 0; i
< binary
->global_symbol_count
; ++i
) {
339 if (binary
->global_symbol_offsets
[i
] == symbol_offset
) {
340 unsigned offset
= i
* binary
->config_size_per_symbol
;
341 return binary
->config
+ offset
;
344 return binary
->config
;
347 static void r600_shader_binary_read_config(const struct ac_shader_binary
*binary
,
348 struct r600_bytecode
*bc
,
349 uint64_t symbol_offset
,
353 const unsigned char *config
=
354 r600_shader_binary_config_start(binary
, symbol_offset
);
356 for (i
= 0; i
< binary
->config_size_per_symbol
; i
+= 8) {
358 util_le32_to_cpu(*(uint32_t*)(config
+ i
));
360 util_le32_to_cpu(*(uint32_t*)(config
+ i
+ 4));
363 case R_028850_SQ_PGM_RESOURCES_PS
:
364 case R_028868_SQ_PGM_RESOURCES_VS
:
365 /* Evergreen / Northern Islands */
366 case R_028844_SQ_PGM_RESOURCES_PS
:
367 case R_028860_SQ_PGM_RESOURCES_VS
:
368 case R_0288D4_SQ_PGM_RESOURCES_LS
:
369 bc
->ngpr
= MAX2(bc
->ngpr
, G_028844_NUM_GPRS(value
));
370 bc
->nstack
= MAX2(bc
->nstack
, G_028844_STACK_SIZE(value
));
372 case R_02880C_DB_SHADER_CONTROL
:
373 *use_kill
= G_02880C_KILL_ENABLE(value
);
375 case R_0288E8_SQ_LDS_ALLOC
:
382 static unsigned r600_create_shader(struct r600_bytecode
*bc
,
383 const struct ac_shader_binary
*binary
,
387 assert(binary
->code_size
% 4 == 0);
388 bc
->bytecode
= CALLOC(1, binary
->code_size
);
389 memcpy(bc
->bytecode
, binary
->code
, binary
->code_size
);
390 bc
->ndw
= binary
->code_size
/ 4;
392 r600_shader_binary_read_config(binary
, bc
, 0, use_kill
);
398 static void r600_destroy_shader(struct r600_bytecode
*bc
)
403 static void *evergreen_create_compute_state(struct pipe_context
*ctx
,
404 const struct pipe_compute_state
*cso
)
406 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
407 struct r600_pipe_compute
*shader
= CALLOC_STRUCT(r600_pipe_compute
);
409 const struct pipe_llvm_program_header
*header
;
414 COMPUTE_DBG(rctx
->screen
, "*** evergreen_create_compute_state\n");
416 code
= cso
->prog
+ sizeof(struct pipe_llvm_program_header
);
417 radeon_shader_binary_init(&shader
->binary
);
418 r600_elf_read(code
, header
->num_bytes
, &shader
->binary
);
419 r600_create_shader(&shader
->bc
, &shader
->binary
, &use_kill
);
421 /* Upload code + ROdata */
422 shader
->code_bo
= r600_compute_buffer_alloc_vram(rctx
->screen
,
424 p
= r600_buffer_map_sync_with_rings(&rctx
->b
, shader
->code_bo
, PIPE_TRANSFER_WRITE
);
425 //TODO: use util_memcpy_cpu_to_le32 ?
426 memcpy(p
, shader
->bc
.bytecode
, shader
->bc
.ndw
* 4);
427 rctx
->b
.ws
->buffer_unmap(shader
->code_bo
->buf
);
431 shader
->local_size
= cso
->req_local_mem
;
432 shader
->private_size
= cso
->req_private_mem
;
433 shader
->input_size
= cso
->req_input_mem
;
438 static void evergreen_delete_compute_state(struct pipe_context
*ctx
, void *state
)
440 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
441 struct r600_pipe_compute
*shader
= state
;
443 COMPUTE_DBG(rctx
->screen
, "*** evergreen_delete_compute_state\n");
449 radeon_shader_binary_clean(&shader
->binary
);
451 r600_destroy_shader(&shader
->bc
);
453 /* TODO destroy shader->code_bo, shader->const_bo
454 * we'll need something like r600_buffer_free */
458 static void evergreen_bind_compute_state(struct pipe_context
*ctx
, void *state
)
460 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
462 COMPUTE_DBG(rctx
->screen
, "*** evergreen_bind_compute_state\n");
464 rctx
->cs_shader_state
.shader
= (struct r600_pipe_compute
*)state
;
467 /* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
468 * kernel parameters there are implicit parameters that need to be stored
469 * in the vertex buffer as well. Here is how these parameters are organized in
472 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
473 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
474 * DWORDS 6-8: Number of work items within each work group in each dimension
476 * DWORDS 9+ : Kernel parameters
478 static void evergreen_compute_upload_input(struct pipe_context
*ctx
,
479 const struct pipe_grid_info
*info
)
481 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
482 struct r600_pipe_compute
*shader
= rctx
->cs_shader_state
.shader
;
484 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
487 unsigned input_size
= shader
->input_size
+ 36;
488 uint32_t *num_work_groups_start
;
489 uint32_t *global_size_start
;
490 uint32_t *local_size_start
;
491 uint32_t *kernel_parameters_start
;
493 struct pipe_transfer
*transfer
= NULL
;
495 if (shader
->input_size
== 0) {
499 if (!shader
->kernel_param
) {
500 /* Add space for the grid dimensions */
501 shader
->kernel_param
= (struct r600_resource
*)
502 pipe_buffer_create(ctx
->screen
, 0,
503 PIPE_USAGE_IMMUTABLE
, input_size
);
506 u_box_1d(0, input_size
, &box
);
507 num_work_groups_start
= ctx
->transfer_map(ctx
,
508 (struct pipe_resource
*)shader
->kernel_param
,
509 0, PIPE_TRANSFER_WRITE
| PIPE_TRANSFER_DISCARD_RANGE
,
511 global_size_start
= num_work_groups_start
+ (3 * (sizeof(uint
) /4));
512 local_size_start
= global_size_start
+ (3 * (sizeof(uint
)) / 4);
513 kernel_parameters_start
= local_size_start
+ (3 * (sizeof(uint
)) / 4);
515 /* Copy the work group size */
516 memcpy(num_work_groups_start
, info
->grid
, 3 * sizeof(uint
));
518 /* Copy the global size */
519 for (i
= 0; i
< 3; i
++) {
520 global_size_start
[i
] = info
->grid
[i
] * info
->block
[i
];
523 /* Copy the local dimensions */
524 memcpy(local_size_start
, info
->block
, 3 * sizeof(uint
));
526 /* Copy the kernel inputs */
527 memcpy(kernel_parameters_start
, info
->input
, shader
->input_size
);
529 for (i
= 0; i
< (input_size
/ 4); i
++) {
530 COMPUTE_DBG(rctx
->screen
, "input %i : %u\n", i
,
531 ((unsigned*)num_work_groups_start
)[i
]);
534 ctx
->transfer_unmap(ctx
, transfer
);
536 /* ID=0 and ID=3 are reserved for the parameters.
537 * LLVM will preferably use ID=0, but it does not work for dynamic
539 evergreen_cs_set_vertex_buffer(rctx
, 3, 0,
540 (struct pipe_resource
*)shader
->kernel_param
);
541 evergreen_cs_set_constant_buffer(rctx
, 0, 0, input_size
,
542 (struct pipe_resource
*)shader
->kernel_param
);
545 static void evergreen_emit_dispatch(struct r600_context
*rctx
,
546 const struct pipe_grid_info
*info
)
549 struct radeon_winsys_cs
*cs
= rctx
->b
.gfx
.cs
;
550 struct r600_pipe_compute
*shader
= rctx
->cs_shader_state
.shader
;
552 unsigned num_pipes
= rctx
->screen
->b
.info
.r600_max_quad_pipes
;
553 unsigned wave_divisor
= (16 * num_pipes
);
556 unsigned lds_size
= shader
->local_size
/ 4 +
560 /* Calculate group_size/grid_size */
561 for (i
= 0; i
< 3; i
++) {
562 group_size
*= info
->block
[i
];
565 for (i
= 0; i
< 3; i
++) {
566 grid_size
*= info
->grid
[i
];
569 /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
570 num_waves
= (info
->block
[0] * info
->block
[1] * info
->block
[2] +
571 wave_divisor
- 1) / wave_divisor
;
573 COMPUTE_DBG(rctx
->screen
, "Using %u pipes, "
574 "%u wavefronts per thread block, "
575 "allocating %u dwords lds.\n",
576 num_pipes
, num_waves
, lds_size
);
578 radeon_set_config_reg(cs
, R_008970_VGT_NUM_INDICES
, group_size
);
580 radeon_set_config_reg_seq(cs
, R_00899C_VGT_COMPUTE_START_X
, 3);
581 radeon_emit(cs
, 0); /* R_00899C_VGT_COMPUTE_START_X */
582 radeon_emit(cs
, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
583 radeon_emit(cs
, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
585 radeon_set_config_reg(cs
, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE
,
588 radeon_compute_set_context_reg_seq(cs
, R_0286EC_SPI_COMPUTE_NUM_THREAD_X
, 3);
589 radeon_emit(cs
, info
->block
[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
590 radeon_emit(cs
, info
->block
[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
591 radeon_emit(cs
, info
->block
[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
593 if (rctx
->b
.chip_class
< CAYMAN
) {
594 assert(lds_size
<= 8192);
596 /* Cayman appears to have a slightly smaller limit, see the
597 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
598 assert(lds_size
<= 8160);
601 radeon_compute_set_context_reg(cs
, R_0288E8_SQ_LDS_ALLOC
,
602 lds_size
| (num_waves
<< 14));
604 /* Dispatch packet */
605 radeon_emit(cs
, PKT3C(PKT3_DISPATCH_DIRECT
, 3, 0));
606 radeon_emit(cs
, info
->grid
[0]);
607 radeon_emit(cs
, info
->grid
[1]);
608 radeon_emit(cs
, info
->grid
[2]);
609 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
616 static void compute_emit_cs(struct r600_context
*rctx
,
617 const struct pipe_grid_info
*info
)
619 struct radeon_winsys_cs
*cs
= rctx
->b
.gfx
.cs
;
622 /* make sure that the gfx ring is only one active */
623 if (radeon_emitted(rctx
->b
.dma
.cs
, 0)) {
624 rctx
->b
.dma
.flush(rctx
, RADEON_FLUSH_ASYNC
, NULL
);
627 /* Initialize all the compute-related registers.
629 * See evergreen_init_atom_start_compute_cs() in this file for the list
630 * of registers initialized by the start_compute_cs_cmd atom.
632 r600_emit_command_buffer(cs
, &rctx
->start_compute_cs_cmd
);
634 /* emit config state */
635 if (rctx
->b
.chip_class
== EVERGREEN
)
636 r600_emit_atom(rctx
, &rctx
->config_state
.atom
);
638 rctx
->b
.flags
|= R600_CONTEXT_WAIT_3D_IDLE
| R600_CONTEXT_FLUSH_AND_INV
;
639 r600_flush_emit(rctx
);
641 /* Emit colorbuffers. */
642 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
643 for (i
= 0; i
< 8 && i
< rctx
->framebuffer
.state
.nr_cbufs
; i
++) {
644 struct r600_surface
*cb
= (struct r600_surface
*)rctx
->framebuffer
.state
.cbufs
[i
];
645 unsigned reloc
= radeon_add_to_buffer_list(&rctx
->b
, &rctx
->b
.gfx
,
646 (struct r600_resource
*)cb
->base
.texture
,
647 RADEON_USAGE_READWRITE
,
648 RADEON_PRIO_SHADER_RW_BUFFER
);
650 radeon_compute_set_context_reg_seq(cs
, R_028C60_CB_COLOR0_BASE
+ i
* 0x3C, 7);
651 radeon_emit(cs
, cb
->cb_color_base
); /* R_028C60_CB_COLOR0_BASE */
652 radeon_emit(cs
, cb
->cb_color_pitch
); /* R_028C64_CB_COLOR0_PITCH */
653 radeon_emit(cs
, cb
->cb_color_slice
); /* R_028C68_CB_COLOR0_SLICE */
654 radeon_emit(cs
, cb
->cb_color_view
); /* R_028C6C_CB_COLOR0_VIEW */
655 radeon_emit(cs
, cb
->cb_color_info
); /* R_028C70_CB_COLOR0_INFO */
656 radeon_emit(cs
, cb
->cb_color_attrib
); /* R_028C74_CB_COLOR0_ATTRIB */
657 radeon_emit(cs
, cb
->cb_color_dim
); /* R_028C78_CB_COLOR0_DIM */
659 radeon_emit(cs
, PKT3(PKT3_NOP
, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
660 radeon_emit(cs
, reloc
);
662 radeon_emit(cs
, PKT3(PKT3_NOP
, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
663 radeon_emit(cs
, reloc
);
666 radeon_compute_set_context_reg(cs
, R_028C70_CB_COLOR0_INFO
+ i
* 0x3C,
667 S_028C70_FORMAT(V_028C70_COLOR_INVALID
));
669 radeon_compute_set_context_reg(cs
, R_028E50_CB_COLOR8_INFO
+ (i
- 8) * 0x1C,
670 S_028C70_FORMAT(V_028C70_COLOR_INVALID
));
672 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
673 radeon_compute_set_context_reg(cs
, R_028238_CB_TARGET_MASK
,
674 rctx
->compute_cb_target_mask
);
677 /* Emit vertex buffer state */
678 rctx
->cs_vertex_buffer_state
.atom
.num_dw
= 12 * util_bitcount(rctx
->cs_vertex_buffer_state
.dirty_mask
);
679 r600_emit_atom(rctx
, &rctx
->cs_vertex_buffer_state
.atom
);
681 /* Emit constant buffer state */
682 r600_emit_atom(rctx
, &rctx
->constbuf_state
[PIPE_SHADER_COMPUTE
].atom
);
684 /* Emit sampler state */
685 r600_emit_atom(rctx
, &rctx
->samplers
[PIPE_SHADER_COMPUTE
].states
.atom
);
687 /* Emit sampler view (texture resource) state */
688 r600_emit_atom(rctx
, &rctx
->samplers
[PIPE_SHADER_COMPUTE
].views
.atom
);
690 /* Emit compute shader state */
691 r600_emit_atom(rctx
, &rctx
->cs_shader_state
.atom
);
693 /* Emit dispatch state and dispatch packet */
694 evergreen_emit_dispatch(rctx
, info
);
696 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
698 rctx
->b
.flags
|= R600_CONTEXT_INV_CONST_CACHE
|
699 R600_CONTEXT_INV_VERTEX_CACHE
|
700 R600_CONTEXT_INV_TEX_CACHE
;
701 r600_flush_emit(rctx
);
704 if (rctx
->b
.chip_class
>= CAYMAN
) {
705 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
706 radeon_emit(cs
, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH
) | EVENT_INDEX(4));
707 /* DEALLOC_STATE prevents the GPU from hanging when a
708 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
709 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
711 radeon_emit(cs
, PKT3C(PKT3_DEALLOC_STATE
, 0, 0));
716 COMPUTE_DBG(rctx
->screen
, "cdw: %i\n", cs
->cdw
);
717 for (i
= 0; i
< cs
->cdw
; i
++) {
718 COMPUTE_DBG(rctx
->screen
, "%4i : 0x%08X\n", i
, cs
->buf
[i
]);
726 * Emit function for r600_cs_shader_state atom
728 void evergreen_emit_cs_shader(struct r600_context
*rctx
,
729 struct r600_atom
*atom
)
731 struct r600_cs_shader_state
*state
=
732 (struct r600_cs_shader_state
*)atom
;
733 struct r600_pipe_compute
*shader
= state
->shader
;
734 struct radeon_winsys_cs
*cs
= rctx
->b
.gfx
.cs
;
736 struct r600_resource
*code_bo
;
737 unsigned ngpr
, nstack
;
739 code_bo
= shader
->code_bo
;
740 va
= shader
->code_bo
->gpu_address
+ state
->pc
;
741 ngpr
= shader
->bc
.ngpr
;
742 nstack
= shader
->bc
.nstack
;
744 radeon_compute_set_context_reg_seq(cs
, R_0288D0_SQ_PGM_START_LS
, 3);
745 radeon_emit(cs
, va
>> 8); /* R_0288D0_SQ_PGM_START_LS */
746 radeon_emit(cs
, /* R_0288D4_SQ_PGM_RESOURCES_LS */
747 S_0288D4_NUM_GPRS(ngpr
)
748 | S_0288D4_STACK_SIZE(nstack
));
749 radeon_emit(cs
, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
751 radeon_emit(cs
, PKT3C(PKT3_NOP
, 0, 0));
752 radeon_emit(cs
, radeon_add_to_buffer_list(&rctx
->b
, &rctx
->b
.gfx
,
753 code_bo
, RADEON_USAGE_READ
,
754 RADEON_PRIO_SHADER_BINARY
));
757 static void evergreen_launch_grid(struct pipe_context
*ctx
,
758 const struct pipe_grid_info
*info
)
760 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
762 struct r600_pipe_compute
*shader
= rctx
->cs_shader_state
.shader
;
765 rctx
->cs_shader_state
.pc
= info
->pc
;
766 /* Get the config information for this kernel. */
767 r600_shader_binary_read_config(&shader
->binary
, &shader
->bc
,
768 info
->pc
, &use_kill
);
771 COMPUTE_DBG(rctx
->screen
, "*** evergreen_launch_grid: pc = %u\n", info
->pc
);
774 evergreen_compute_upload_input(ctx
, info
);
775 compute_emit_cs(rctx
, info
);
778 static void evergreen_set_compute_resources(struct pipe_context
*ctx
,
779 unsigned start
, unsigned count
,
780 struct pipe_surface
**surfaces
)
782 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
783 struct r600_surface
**resources
= (struct r600_surface
**)surfaces
;
785 COMPUTE_DBG(rctx
->screen
, "*** evergreen_set_compute_resources: start = %u count = %u\n",
788 for (unsigned i
= 0; i
< count
; i
++) {
789 /* The First four vertex buffers are reserved for parameters and
791 unsigned vtx_id
= 4 + i
;
793 struct r600_resource_global
*buffer
=
794 (struct r600_resource_global
*)
795 resources
[i
]->base
.texture
;
796 if (resources
[i
]->base
.writable
) {
799 evergreen_set_rat(rctx
->cs_shader_state
.shader
, i
+1,
800 (struct r600_resource
*)resources
[i
]->base
.texture
,
801 buffer
->chunk
->start_in_dw
*4,
802 resources
[i
]->base
.texture
->width0
);
805 evergreen_cs_set_vertex_buffer(rctx
, vtx_id
,
806 buffer
->chunk
->start_in_dw
* 4,
807 resources
[i
]->base
.texture
);
812 static void evergreen_set_global_binding(struct pipe_context
*ctx
,
813 unsigned first
, unsigned n
,
814 struct pipe_resource
**resources
,
817 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
818 struct compute_memory_pool
*pool
= rctx
->screen
->global_pool
;
819 struct r600_resource_global
**buffers
=
820 (struct r600_resource_global
**)resources
;
823 COMPUTE_DBG(rctx
->screen
, "*** evergreen_set_global_binding first = %u n = %u\n",
831 /* We mark these items for promotion to the pool if they
832 * aren't already there */
833 for (i
= first
; i
< first
+ n
; i
++) {
834 struct compute_memory_item
*item
= buffers
[i
]->chunk
;
836 if (!is_item_in_pool(item
))
837 buffers
[i
]->chunk
->status
|= ITEM_FOR_PROMOTING
;
840 if (compute_memory_finalize_pending(pool
, ctx
) == -1) {
845 for (i
= first
; i
< first
+ n
; i
++)
847 uint32_t buffer_offset
;
849 assert(resources
[i
]->target
== PIPE_BUFFER
);
850 assert(resources
[i
]->bind
& PIPE_BIND_GLOBAL
);
852 buffer_offset
= util_le32_to_cpu(*(handles
[i
]));
853 handle
= buffer_offset
+ buffers
[i
]->chunk
->start_in_dw
* 4;
855 *(handles
[i
]) = util_cpu_to_le32(handle
);
858 /* globals for writing */
859 evergreen_set_rat(rctx
->cs_shader_state
.shader
, 0, pool
->bo
, 0, pool
->size_in_dw
* 4);
860 /* globals for reading */
861 evergreen_cs_set_vertex_buffer(rctx
, 1, 0,
862 (struct pipe_resource
*)pool
->bo
);
864 /* constants for reading, LLVM puts them in text segment */
865 evergreen_cs_set_vertex_buffer(rctx
, 2, 0,
866 (struct pipe_resource
*)rctx
->cs_shader_state
.shader
->code_bo
);
870 * This function initializes all the compute specific registers that need to
871 * be initialized for each compute command stream. Registers that are common
872 * to both compute and 3D will be initialized at the beginning of each compute
873 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
874 * packet requires that the shader type bit be set, we must initialize all
875 * context registers needed for compute in this function. The registers
876 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
877 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
880 void evergreen_init_atom_start_compute_cs(struct r600_context
*rctx
)
882 struct r600_command_buffer
*cb
= &rctx
->start_compute_cs_cmd
;
884 int num_stack_entries
;
886 /* since all required registers are initialized in the
887 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
889 r600_init_command_buffer(cb
, 256);
890 cb
->pkt_flags
= RADEON_CP_PACKET3_COMPUTE_MODE
;
892 /* This must be first. */
893 r600_store_value(cb
, PKT3(PKT3_CONTEXT_CONTROL
, 1, 0));
894 r600_store_value(cb
, 0x80000000);
895 r600_store_value(cb
, 0x80000000);
897 /* We're setting config registers here. */
898 r600_store_value(cb
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
899 r600_store_value(cb
, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH
) | EVENT_INDEX(4));
901 switch (rctx
->b
.family
) {
905 num_stack_entries
= 256;
909 num_stack_entries
= 256;
913 num_stack_entries
= 512;
918 num_stack_entries
= 512;
922 num_stack_entries
= 256;
926 num_stack_entries
= 256;
930 num_stack_entries
= 512;
934 num_stack_entries
= 512;
938 num_stack_entries
= 256;
942 num_stack_entries
= 256;
946 /* Config Registers */
947 if (rctx
->b
.chip_class
< CAYMAN
)
948 evergreen_init_common_regs(rctx
, cb
, rctx
->b
.chip_class
, rctx
->b
.family
,
949 rctx
->screen
->b
.info
.drm_minor
);
951 cayman_init_common_regs(cb
, rctx
->b
.chip_class
, rctx
->b
.family
,
952 rctx
->screen
->b
.info
.drm_minor
);
954 /* The primitive type always needs to be POINTLIST for compute. */
955 r600_store_config_reg(cb
, R_008958_VGT_PRIMITIVE_TYPE
,
956 V_008958_DI_PT_POINTLIST
);
958 if (rctx
->b
.chip_class
< CAYMAN
) {
960 /* These registers control which simds can be used by each stage.
961 * The default for these registers is 0xffffffff, which means
962 * all simds are available for each stage. It's possible we may
963 * want to play around with these in the future, but for now
964 * the default value is fine.
966 * R_008E20_SQ_STATIC_THREAD_MGMT1
967 * R_008E24_SQ_STATIC_THREAD_MGMT2
968 * R_008E28_SQ_STATIC_THREAD_MGMT3
971 /* XXX: We may need to adjust the thread and stack resource
972 * values for 3D/compute interop */
974 r600_store_config_reg_seq(cb
, R_008C18_SQ_THREAD_RESOURCE_MGMT_1
, 5);
976 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
977 * Set the number of threads used by the PS/VS/GS/ES stage to
980 r600_store_value(cb
, 0);
982 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
983 * Set the number of threads used by the CS (aka LS) stage to
984 * the maximum number of threads and set the number of threads
985 * for the HS stage to 0. */
986 r600_store_value(cb
, S_008C1C_NUM_LS_THREADS(num_threads
));
988 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
989 * Set the Control Flow stack entries to 0 for PS/VS stages */
990 r600_store_value(cb
, 0);
992 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
993 * Set the Control Flow stack entries to 0 for GS/ES stages */
994 r600_store_value(cb
, 0);
996 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
997 * Set the Contol Flow stack entries to 0 for the HS stage, and
998 * set it to the maximum value for the CS (aka LS) stage. */
1000 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries
));
1002 /* Give the compute shader all the available LDS space.
1003 * NOTE: This only sets the maximum number of dwords that a compute
1004 * shader can allocate. When a shader is executed, we still need to
1005 * allocate the appropriate amount of LDS dwords using the
1006 * CM_R_0288E8_SQ_LDS_ALLOC register.
1008 if (rctx
->b
.chip_class
< CAYMAN
) {
1009 r600_store_config_reg(cb
, R_008E2C_SQ_LDS_RESOURCE_MGMT
,
1010 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1012 r600_store_context_reg(cb
, CM_R_0286FC_SPI_LDS_MGMT
,
1013 S_0286FC_NUM_PS_LDS(0) |
1014 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1017 /* Context Registers */
1019 if (rctx
->b
.chip_class
< CAYMAN
) {
1020 /* workaround for hw issues with dyn gpr - must set all limits
1021 * to 240 instead of 0, 0x1e == 240 / 8
1023 r600_store_context_reg(cb
, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1
,
1024 S_028838_PS_GPRS(0x1e) |
1025 S_028838_VS_GPRS(0x1e) |
1026 S_028838_GS_GPRS(0x1e) |
1027 S_028838_ES_GPRS(0x1e) |
1028 S_028838_HS_GPRS(0x1e) |
1029 S_028838_LS_GPRS(0x1e));
1032 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1033 r600_store_context_reg(cb
, R_028A40_VGT_GS_MODE
,
1034 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1036 r600_store_context_reg(cb
, R_028B54_VGT_SHADER_STAGES_EN
, 2/*CS_ON*/);
1038 r600_store_context_reg(cb
, R_0286E8_SPI_COMPUTE_INPUT_CNTL
,
1039 S_0286E8_TID_IN_GROUP_ENA(1) |
1040 S_0286E8_TGID_ENA(1) |
1041 S_0286E8_DISABLE_INDEX_PACK(1));
1043 /* The LOOP_CONST registers are an optimizations for loops that allows
1044 * you to store the initial counter, increment value, and maximum
1045 * counter value in a register so that hardware can calculate the
1046 * correct number of iterations for the loop, so that you don't need
1047 * to have the loop counter in your shader code. We don't currently use
1048 * this optimization, so we must keep track of the counter in the
1049 * shader and use a break instruction to exit loops. However, the
1050 * hardware will still uses this register to determine when to exit a
1051 * loop, so we need to initialize the counter to 0, set the increment
1052 * value to 1 and the maximum counter value to the 4095 (0xfff) which
1053 * is the maximum value allowed. This gives us a maximum of 4096
1054 * iterations for our loops, but hopefully our break instruction will
1055 * execute before some time before the 4096th iteration.
1057 eg_store_loop_const(cb
, R_03A200_SQ_LOOP_CONST_0
+ (160 * 4), 0x1000FFF);
/**
 * Wire up the compute-related entry points of the gallium pipe_context
 * for evergreen-class GPUs.  Called once at context creation.
 */
void evergreen_init_compute_state_functions(struct r600_context *rctx)
{
	/* Compute shader object lifecycle. */
	rctx->b.b.create_compute_state = evergreen_create_compute_state;
	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	/* Binding of compute resources and global (memory pool) buffers. */
	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	rctx->b.b.set_global_binding = evergreen_set_global_binding;
	/* Kernel dispatch. */
	rctx->b.b.launch_grid = evergreen_launch_grid;
}
1072 static void *r600_compute_global_transfer_map(struct pipe_context
*ctx
,
1073 struct pipe_resource
*resource
,
1076 const struct pipe_box
*box
,
1077 struct pipe_transfer
**ptransfer
)
1079 struct r600_context
*rctx
= (struct r600_context
*)ctx
;
1080 struct compute_memory_pool
*pool
= rctx
->screen
->global_pool
;
1081 struct r600_resource_global
* buffer
=
1082 (struct r600_resource_global
*)resource
;
1084 struct compute_memory_item
*item
= buffer
->chunk
;
1085 struct pipe_resource
*dst
= NULL
;
1086 unsigned offset
= box
->x
;
1088 if (is_item_in_pool(item
)) {
1089 compute_memory_demote_item(pool
, item
, ctx
);
1092 if (item
->real_buffer
== NULL
) {
1094 r600_compute_buffer_alloc_vram(pool
->screen
, item
->size_in_dw
* 4);
1098 dst
= (struct pipe_resource
*)item
->real_buffer
;
1100 if (usage
& PIPE_TRANSFER_READ
)
1101 buffer
->chunk
->status
|= ITEM_MAPPED_FOR_READING
;
1103 COMPUTE_DBG(rctx
->screen
, "* r600_compute_global_transfer_map()\n"
1104 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1105 "width = %u, height = %u, depth = %u)\n", level
, usage
,
1106 box
->x
, box
->y
, box
->z
, box
->width
, box
->height
,
1108 COMPUTE_DBG(rctx
->screen
, "Buffer id = %"PRIi64
" offset = "
1109 "%u (box.x)\n", item
->id
, box
->x
);
1112 assert(resource
->target
== PIPE_BUFFER
);
1113 assert(resource
->bind
& PIPE_BIND_GLOBAL
);
1114 assert(box
->x
>= 0);
1115 assert(box
->y
== 0);
1116 assert(box
->z
== 0);
1118 ///TODO: do it better, mapping is not possible if the pool is too big
1119 return pipe_buffer_map_range(ctx
, dst
,
1120 offset
, box
->width
, usage
, ptransfer
);
static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
					       struct pipe_transfer *transfer)
{
	/* struct r600_resource_global are not real resources, they just map
	 * to an offset within the compute memory pool.  The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called it uses the memory pool's
	 * vtable, so this hook should never be reached.
	 * NOTE(review): the original comment said the pool vtable "calls
	 * r600_buffer_transfer_map()"; presumably the unmap counterpart
	 * was meant -- confirm against the pool resource's vtable. */
	assert (!"This function should not be called");
}
static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
						      struct pipe_transfer *transfer,
						      const struct pipe_box *box)
{
	/* Explicit sub-region flushing is not implemented for global
	 * (compute memory pool) buffers; nothing is expected to reach
	 * this path yet. */
	assert(0 && "TODO");
}
1146 static void r600_compute_global_buffer_destroy(struct pipe_screen
*screen
,
1147 struct pipe_resource
*res
)
1149 struct r600_resource_global
* buffer
= NULL
;
1150 struct r600_screen
* rscreen
= NULL
;
1152 assert(res
->target
== PIPE_BUFFER
);
1153 assert(res
->bind
& PIPE_BIND_GLOBAL
);
1155 buffer
= (struct r600_resource_global
*)res
;
1156 rscreen
= (struct r600_screen
*)screen
;
1158 compute_memory_free(rscreen
->global_pool
, buffer
->chunk
->id
);
1160 buffer
->chunk
= NULL
;
/* Resource vtable for struct r600_resource_global.  transfer_map redirects
 * to the compute memory pool's backing resource, so unmap goes through the
 * pool resource's own vtable and r600_compute_global_transfer_unmap is
 * expected to be unreachable (see the comment in that function). */
static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
};
1173 struct pipe_resource
*r600_compute_global_buffer_create(struct pipe_screen
*screen
,
1174 const struct pipe_resource
*templ
)
1176 struct r600_resource_global
* result
= NULL
;
1177 struct r600_screen
* rscreen
= NULL
;
1180 assert(templ
->target
== PIPE_BUFFER
);
1181 assert(templ
->bind
& PIPE_BIND_GLOBAL
);
1182 assert(templ
->array_size
== 1 || templ
->array_size
== 0);
1183 assert(templ
->depth0
== 1 || templ
->depth0
== 0);
1184 assert(templ
->height0
== 1 || templ
->height0
== 0);
1186 result
= (struct r600_resource_global
*)
1187 CALLOC(sizeof(struct r600_resource_global
), 1);
1188 rscreen
= (struct r600_screen
*)screen
;
1190 COMPUTE_DBG(rscreen
, "*** r600_compute_global_buffer_create\n");
1191 COMPUTE_DBG(rscreen
, "width = %u array_size = %u\n", templ
->width0
,
1194 result
->base
.b
.vtbl
= &r600_global_buffer_vtbl
;
1195 result
->base
.b
.b
= *templ
;
1196 result
->base
.b
.b
.screen
= screen
;
1197 pipe_reference_init(&result
->base
.b
.b
.reference
, 1);
1199 size_in_dw
= (templ
->width0
+3) / 4;
1201 result
->chunk
= compute_memory_alloc(rscreen
->global_pool
, size_in_dw
);
1203 if (result
->chunk
== NULL
)
1209 return &result
->base
.b
.b
;