2 * Copyright 2015 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
27 #include "driver_ddebug/dd_util.h"
28 #include "si_compute.h"
31 #include "sid_tables.h"
32 #include "tgsi/tgsi_from_mesa.h"
33 #include "util/u_dump.h"
34 #include "util/u_log.h"
35 #include "util/u_memory.h"
36 #include "util/u_string.h"
38 static void si_dump_bo_list(struct si_context
*sctx
, const struct radeon_saved_cs
*saved
, FILE *f
);
40 DEBUG_GET_ONCE_OPTION(replace_shaders
, "RADEON_REPLACE_SHADERS", NULL
)
43 * Store a linearized copy of all chunks of \p cs together with the buffer
46 void si_save_cs(struct radeon_winsys
*ws
, struct radeon_cmdbuf
*cs
, struct radeon_saved_cs
*saved
,
52 /* Save the IB chunks. */
53 saved
->num_dw
= cs
->prev_dw
+ cs
->current
.cdw
;
54 saved
->ib
= MALLOC(4 * saved
->num_dw
);
59 for (i
= 0; i
< cs
->num_prev
; ++i
) {
60 memcpy(buf
, cs
->prev
[i
].buf
, cs
->prev
[i
].cdw
* 4);
61 buf
+= cs
->prev
[i
].cdw
;
63 memcpy(buf
, cs
->current
.buf
, cs
->current
.cdw
* 4);
68 /* Save the buffer list. */
69 saved
->bo_count
= ws
->cs_get_buffer_list(cs
, NULL
);
70 saved
->bo_list
= CALLOC(saved
->bo_count
, sizeof(saved
->bo_list
[0]));
71 if (!saved
->bo_list
) {
75 ws
->cs_get_buffer_list(cs
, saved
->bo_list
);
80 fprintf(stderr
, "%s: out of memory\n", __func__
);
81 memset(saved
, 0, sizeof(*saved
));
84 void si_clear_saved_cs(struct radeon_saved_cs
*saved
)
89 memset(saved
, 0, sizeof(*saved
));
92 void si_destroy_saved_cs(struct si_saved_cs
*scs
)
94 si_clear_saved_cs(&scs
->gfx
);
95 si_resource_reference(&scs
->trace_buf
, NULL
);
99 static void si_dump_shader(struct si_screen
*sscreen
, struct si_shader
*shader
, FILE *f
)
101 if (shader
->shader_log
)
102 fwrite(shader
->shader_log
, shader
->shader_log_size
, 1, f
);
104 si_shader_dump(sscreen
, shader
, NULL
, f
, false);
106 if (shader
->bo
&& sscreen
->options
.dump_shader_binary
) {
107 unsigned size
= shader
->bo
->b
.b
.width0
;
108 fprintf(f
, "BO: VA=%" PRIx64
" Size=%u\n", shader
->bo
->gpu_address
, size
);
110 const char *mapped
= sscreen
->ws
->buffer_map(
111 shader
->bo
->buf
, NULL
,
112 PIPE_TRANSFER_UNSYNCHRONIZED
| PIPE_TRANSFER_READ
| RADEON_TRANSFER_TEMPORARY
);
114 for (unsigned i
= 0; i
< size
; i
+= 4) {
115 fprintf(f
, " %4x: %08x\n", i
, *(uint32_t *)(mapped
+ i
));
118 sscreen
->ws
->buffer_unmap(shader
->bo
->buf
);
struct si_log_chunk_shader {
   /* The shader destroy code assumes a current context for unlinking of
    * PM4 packets etc.
    *
    * While we should be able to destroy shaders without a context, doing
    * so would happen only very rarely and be therefore likely to fail
    * just when you're trying to debug something. Let's just remember the
    * current context in the chunk.
    */
   struct si_context *ctx;
   struct si_shader *shader;

   /* For keep-alive reference counts */
   struct si_shader_selector *sel;
   struct si_compute *program;
};
141 static void si_log_chunk_shader_destroy(void *data
)
143 struct si_log_chunk_shader
*chunk
= data
;
144 si_shader_selector_reference(chunk
->ctx
, &chunk
->sel
, NULL
);
145 si_compute_reference(&chunk
->program
, NULL
);
149 static void si_log_chunk_shader_print(void *data
, FILE *f
)
151 struct si_log_chunk_shader
*chunk
= data
;
152 struct si_screen
*sscreen
= chunk
->ctx
->screen
;
153 si_dump_shader(sscreen
, chunk
->shader
, f
);
156 static struct u_log_chunk_type si_log_chunk_type_shader
= {
157 .destroy
= si_log_chunk_shader_destroy
,
158 .print
= si_log_chunk_shader_print
,
161 static void si_dump_gfx_shader(struct si_context
*ctx
, const struct si_shader_ctx_state
*state
,
162 struct u_log_context
*log
)
164 struct si_shader
*current
= state
->current
;
166 if (!state
->cso
|| !current
)
169 struct si_log_chunk_shader
*chunk
= CALLOC_STRUCT(si_log_chunk_shader
);
171 chunk
->shader
= current
;
172 si_shader_selector_reference(ctx
, &chunk
->sel
, current
->selector
);
173 u_log_chunk(log
, &si_log_chunk_type_shader
, chunk
);
176 static void si_dump_compute_shader(struct si_context
*ctx
, struct u_log_context
*log
)
178 const struct si_cs_shader_state
*state
= &ctx
->cs_shader_state
;
183 struct si_log_chunk_shader
*chunk
= CALLOC_STRUCT(si_log_chunk_shader
);
185 chunk
->shader
= &state
->program
->shader
;
186 si_compute_reference(&chunk
->program
, state
->program
);
187 u_log_chunk(log
, &si_log_chunk_type_shader
, chunk
);
191 * Shader compiles can be overridden with arbitrary ELF objects by setting
192 * the environment variable RADEON_REPLACE_SHADERS=num1:filename1[;num2:filename2]
194 * TODO: key this off some hash
196 bool si_replace_shader(unsigned num
, struct si_shader_binary
*binary
)
198 const char *p
= debug_get_option_replace_shaders();
199 const char *semicolon
;
202 long filesize
, nread
;
203 bool replaced
= false;
211 i
= strtoul(p
, &endp
, 0);
215 fprintf(stderr
, "RADEON_REPLACE_SHADERS formatted badly.\n");
231 semicolon
= strchr(p
, ';');
233 p
= copy
= strndup(p
, semicolon
- p
);
235 fprintf(stderr
, "out of memory\n");
240 fprintf(stderr
, "radeonsi: replace shader %u by %s\n", num
, p
);
244 perror("radeonsi: failed to open file");
248 if (fseek(f
, 0, SEEK_END
) != 0)
255 if (fseek(f
, 0, SEEK_SET
) != 0)
258 binary
->elf_buffer
= MALLOC(filesize
);
259 if (!binary
->elf_buffer
) {
260 fprintf(stderr
, "out of memory\n");
264 nread
= fread((void *)binary
->elf_buffer
, 1, filesize
, f
);
265 if (nread
!= filesize
) {
266 FREE((void *)binary
->elf_buffer
);
267 binary
->elf_buffer
= NULL
;
271 binary
->elf_size
= nread
;
281 perror("radeonsi: reading shader");
285 /* Parsed IBs are difficult to read without colors. Use "less -R file" to
286 * read them, or use "aha -b -f file" to convert them to html.
288 #define COLOR_RESET "\033[0m"
289 #define COLOR_RED "\033[31m"
290 #define COLOR_GREEN "\033[1;32m"
291 #define COLOR_YELLOW "\033[1;33m"
292 #define COLOR_CYAN "\033[1;36m"
294 static void si_dump_mmapped_reg(struct si_context
*sctx
, FILE *f
, unsigned offset
)
296 struct radeon_winsys
*ws
= sctx
->ws
;
299 if (ws
->read_registers(ws
, offset
, 1, &value
))
300 ac_dump_reg(f
, sctx
->chip_class
, offset
, value
, ~0);
303 static void si_dump_debug_registers(struct si_context
*sctx
, FILE *f
)
305 if (!sctx
->screen
->info
.has_read_registers_query
)
308 fprintf(f
, "Memory-mapped registers:\n");
309 si_dump_mmapped_reg(sctx
, f
, R_008010_GRBM_STATUS
);
311 /* No other registers can be read on DRM < 3.1.0. */
312 if (!sctx
->screen
->info
.is_amdgpu
|| sctx
->screen
->info
.drm_minor
< 1) {
317 si_dump_mmapped_reg(sctx
, f
, R_008008_GRBM_STATUS2
);
318 si_dump_mmapped_reg(sctx
, f
, R_008014_GRBM_STATUS_SE0
);
319 si_dump_mmapped_reg(sctx
, f
, R_008018_GRBM_STATUS_SE1
);
320 si_dump_mmapped_reg(sctx
, f
, R_008038_GRBM_STATUS_SE2
);
321 si_dump_mmapped_reg(sctx
, f
, R_00803C_GRBM_STATUS_SE3
);
322 si_dump_mmapped_reg(sctx
, f
, R_00D034_SDMA0_STATUS_REG
);
323 si_dump_mmapped_reg(sctx
, f
, R_00D834_SDMA1_STATUS_REG
);
324 if (sctx
->chip_class
<= GFX8
) {
325 si_dump_mmapped_reg(sctx
, f
, R_000E50_SRBM_STATUS
);
326 si_dump_mmapped_reg(sctx
, f
, R_000E4C_SRBM_STATUS2
);
327 si_dump_mmapped_reg(sctx
, f
, R_000E54_SRBM_STATUS3
);
329 si_dump_mmapped_reg(sctx
, f
, R_008680_CP_STAT
);
330 si_dump_mmapped_reg(sctx
, f
, R_008674_CP_STALLED_STAT1
);
331 si_dump_mmapped_reg(sctx
, f
, R_008678_CP_STALLED_STAT2
);
332 si_dump_mmapped_reg(sctx
, f
, R_008670_CP_STALLED_STAT3
);
333 si_dump_mmapped_reg(sctx
, f
, R_008210_CP_CPC_STATUS
);
334 si_dump_mmapped_reg(sctx
, f
, R_008214_CP_CPC_BUSY_STAT
);
335 si_dump_mmapped_reg(sctx
, f
, R_008218_CP_CPC_STALLED_STAT1
);
336 si_dump_mmapped_reg(sctx
, f
, R_00821C_CP_CPF_STATUS
);
337 si_dump_mmapped_reg(sctx
, f
, R_008220_CP_CPF_BUSY_STAT
);
338 si_dump_mmapped_reg(sctx
, f
, R_008224_CP_CPF_STALLED_STAT1
);
/* One logged slice of command-stream activity: dword ranges into the gfx
 * and (optional) prim-discard compute IBs of a saved CS. */
struct si_log_chunk_cs {
   struct si_context *ctx;
   struct si_saved_cs *cs;
   /* NOTE(review): field inferred from chunk->dump_bo_list uses below —
    * whether to append the buffer list when printing. */
   bool dump_bo_list;
   unsigned gfx_begin, gfx_end;
   unsigned compute_begin, compute_end;
};
350 static void si_log_chunk_type_cs_destroy(void *data
)
352 struct si_log_chunk_cs
*chunk
= data
;
353 si_saved_cs_reference(&chunk
->cs
, NULL
);
357 static void si_parse_current_ib(FILE *f
, struct radeon_cmdbuf
*cs
, unsigned begin
, unsigned end
,
358 int *last_trace_id
, unsigned trace_id_count
, const char *name
,
359 enum chip_class chip_class
)
361 unsigned orig_end
= end
;
363 assert(begin
<= end
);
365 fprintf(f
, "------------------ %s begin (dw = %u) ------------------\n", name
, begin
);
367 for (unsigned prev_idx
= 0; prev_idx
< cs
->num_prev
; ++prev_idx
) {
368 struct radeon_cmdbuf_chunk
*chunk
= &cs
->prev
[prev_idx
];
370 if (begin
< chunk
->cdw
) {
371 ac_parse_ib_chunk(f
, chunk
->buf
+ begin
, MIN2(end
, chunk
->cdw
) - begin
, last_trace_id
,
372 trace_id_count
, chip_class
, NULL
, NULL
);
375 if (end
<= chunk
->cdw
)
378 if (begin
< chunk
->cdw
)
379 fprintf(f
, "\n---------- Next %s Chunk ----------\n\n", name
);
381 begin
-= MIN2(begin
, chunk
->cdw
);
385 assert(end
<= cs
->current
.cdw
);
387 ac_parse_ib_chunk(f
, cs
->current
.buf
+ begin
, end
- begin
, last_trace_id
, trace_id_count
,
388 chip_class
, NULL
, NULL
);
390 fprintf(f
, "------------------- %s end (dw = %u) -------------------\n\n", name
, orig_end
);
393 static void si_log_chunk_type_cs_print(void *data
, FILE *f
)
395 struct si_log_chunk_cs
*chunk
= data
;
396 struct si_context
*ctx
= chunk
->ctx
;
397 struct si_saved_cs
*scs
= chunk
->cs
;
398 int last_trace_id
= -1;
399 int last_compute_trace_id
= -1;
401 /* We are expecting that the ddebug pipe has already
402 * waited for the context, so this buffer should be idle.
403 * If the GPU is hung, there is no point in waiting for it.
405 uint32_t *map
= ctx
->ws
->buffer_map(scs
->trace_buf
->buf
, NULL
,
406 PIPE_TRANSFER_UNSYNCHRONIZED
| PIPE_TRANSFER_READ
);
408 last_trace_id
= map
[0];
409 last_compute_trace_id
= map
[1];
412 if (chunk
->gfx_end
!= chunk
->gfx_begin
) {
413 if (chunk
->gfx_begin
== 0) {
414 if (ctx
->cs_preamble_state
)
415 ac_parse_ib(f
, ctx
->cs_preamble_state
->pm4
, ctx
->cs_preamble_state
->ndw
, NULL
, 0,
416 "IB2: Init config", ctx
->chip_class
, NULL
, NULL
);
418 if (ctx
->cs_preamble_gs_rings
)
419 ac_parse_ib(f
, ctx
->cs_preamble_gs_rings
->pm4
, ctx
->cs_preamble_gs_rings
->ndw
, NULL
, 0,
420 "IB2: Init GS rings", ctx
->chip_class
, NULL
, NULL
);
424 ac_parse_ib(f
, scs
->gfx
.ib
+ chunk
->gfx_begin
, chunk
->gfx_end
- chunk
->gfx_begin
,
425 &last_trace_id
, map
? 1 : 0, "IB", ctx
->chip_class
, NULL
, NULL
);
427 si_parse_current_ib(f
, ctx
->gfx_cs
, chunk
->gfx_begin
, chunk
->gfx_end
, &last_trace_id
,
428 map
? 1 : 0, "IB", ctx
->chip_class
);
432 if (chunk
->compute_end
!= chunk
->compute_begin
) {
433 assert(ctx
->prim_discard_compute_cs
);
436 ac_parse_ib(f
, scs
->compute
.ib
+ chunk
->compute_begin
,
437 chunk
->compute_end
- chunk
->compute_begin
, &last_compute_trace_id
, map
? 1 : 0,
438 "Compute IB", ctx
->chip_class
, NULL
, NULL
);
440 si_parse_current_ib(f
, ctx
->prim_discard_compute_cs
, chunk
->compute_begin
,
441 chunk
->compute_end
, &last_compute_trace_id
, map
? 1 : 0, "Compute IB",
446 if (chunk
->dump_bo_list
) {
447 fprintf(f
, "Flushing. Time: ");
448 util_dump_ns(f
, scs
->time_flush
);
450 si_dump_bo_list(ctx
, &scs
->gfx
, f
);
454 static const struct u_log_chunk_type si_log_chunk_type_cs
= {
455 .destroy
= si_log_chunk_type_cs_destroy
,
456 .print
= si_log_chunk_type_cs_print
,
459 static void si_log_cs(struct si_context
*ctx
, struct u_log_context
*log
, bool dump_bo_list
)
461 assert(ctx
->current_saved_cs
);
463 struct si_saved_cs
*scs
= ctx
->current_saved_cs
;
464 unsigned gfx_cur
= ctx
->gfx_cs
->prev_dw
+ ctx
->gfx_cs
->current
.cdw
;
465 unsigned compute_cur
= 0;
467 if (ctx
->prim_discard_compute_cs
)
469 ctx
->prim_discard_compute_cs
->prev_dw
+ ctx
->prim_discard_compute_cs
->current
.cdw
;
471 if (!dump_bo_list
&& gfx_cur
== scs
->gfx_last_dw
&& compute_cur
== scs
->compute_last_dw
)
474 struct si_log_chunk_cs
*chunk
= calloc(1, sizeof(*chunk
));
477 si_saved_cs_reference(&chunk
->cs
, scs
);
478 chunk
->dump_bo_list
= dump_bo_list
;
480 chunk
->gfx_begin
= scs
->gfx_last_dw
;
481 chunk
->gfx_end
= gfx_cur
;
482 scs
->gfx_last_dw
= gfx_cur
;
484 chunk
->compute_begin
= scs
->compute_last_dw
;
485 chunk
->compute_end
= compute_cur
;
486 scs
->compute_last_dw
= compute_cur
;
488 u_log_chunk(log
, &si_log_chunk_type_cs
, chunk
);
491 void si_auto_log_cs(void *data
, struct u_log_context
*log
)
493 struct si_context
*ctx
= (struct si_context
*)data
;
494 si_log_cs(ctx
, log
, false);
497 void si_log_hw_flush(struct si_context
*sctx
)
502 si_log_cs(sctx
, sctx
->log
, true);
504 if (&sctx
->b
== sctx
->screen
->aux_context
) {
505 /* The aux context isn't captured by the ddebug wrapper,
506 * so we dump it on a flush-by-flush basis here.
508 FILE *f
= dd_get_debug_file(false);
510 fprintf(stderr
, "radeonsi: error opening aux context dump file.\n");
512 dd_write_header(f
, &sctx
->screen
->b
, 0);
514 fprintf(f
, "Aux context dump:\n\n");
515 u_log_new_page_print(sctx
->log
, f
);
522 static const char *priority_to_string(enum radeon_bo_priority priority
)
524 #define ITEM(x) [RADEON_PRIO_##x] = #x
525 static const char *table
[64] = {
528 ITEM(SO_FILLED_SIZE
),
538 ITEM(SAMPLER_BUFFER
),
540 ITEM(SHADER_RW_BUFFER
),
541 ITEM(COMPUTE_GLOBAL
),
542 ITEM(SAMPLER_TEXTURE
),
543 ITEM(SHADER_RW_IMAGE
),
544 ITEM(SAMPLER_TEXTURE_MSAA
),
547 ITEM(COLOR_BUFFER_MSAA
),
548 ITEM(DEPTH_BUFFER_MSAA
),
552 ITEM(SCRATCH_BUFFER
),
556 assert(priority
< ARRAY_SIZE(table
));
557 return table
[priority
];
560 static int bo_list_compare_va(const struct radeon_bo_list_item
*a
,
561 const struct radeon_bo_list_item
*b
)
563 return a
->vm_address
< b
->vm_address
? -1 : a
->vm_address
> b
->vm_address
? 1 : 0;
566 static void si_dump_bo_list(struct si_context
*sctx
, const struct radeon_saved_cs
*saved
, FILE *f
)
573 /* Sort the list according to VM adddresses first. */
574 qsort(saved
->bo_list
, saved
->bo_count
, sizeof(saved
->bo_list
[0]), (void *)bo_list_compare_va
);
576 fprintf(f
, "Buffer list (in units of pages = 4kB):\n" COLOR_YELLOW
577 " Size VM start page "
578 "VM end page Usage" COLOR_RESET
"\n");
580 for (i
= 0; i
< saved
->bo_count
; i
++) {
581 /* Note: Buffer sizes are expected to be aligned to 4k by the winsys. */
582 const unsigned page_size
= sctx
->screen
->info
.gart_page_size
;
583 uint64_t va
= saved
->bo_list
[i
].vm_address
;
584 uint64_t size
= saved
->bo_list
[i
].bo_size
;
587 /* If there's unused virtual memory between 2 buffers, print it. */
589 uint64_t previous_va_end
=
590 saved
->bo_list
[i
- 1].vm_address
+ saved
->bo_list
[i
- 1].bo_size
;
592 if (va
> previous_va_end
) {
593 fprintf(f
, " %10" PRIu64
" -- hole --\n", (va
- previous_va_end
) / page_size
);
597 /* Print the buffer. */
598 fprintf(f
, " %10" PRIu64
" 0x%013" PRIX64
" 0x%013" PRIX64
" ",
599 size
/ page_size
, va
/ page_size
, (va
+ size
) / page_size
);
601 /* Print the usage. */
602 for (j
= 0; j
< 32; j
++) {
603 if (!(saved
->bo_list
[i
].priority_usage
& (1u << j
)))
606 fprintf(f
, "%s%s", !hit
? "" : ", ", priority_to_string(j
));
611 fprintf(f
, "\nNote: The holes represent memory not used by the IB.\n"
612 " Other buffers can still be allocated there.\n\n");
615 static void si_dump_framebuffer(struct si_context
*sctx
, struct u_log_context
*log
)
617 struct pipe_framebuffer_state
*state
= &sctx
->framebuffer
.state
;
618 struct si_texture
*tex
;
621 for (i
= 0; i
< state
->nr_cbufs
; i
++) {
622 if (!state
->cbufs
[i
])
625 tex
= (struct si_texture
*)state
->cbufs
[i
]->texture
;
626 u_log_printf(log
, COLOR_YELLOW
"Color buffer %i:" COLOR_RESET
"\n", i
);
627 si_print_texture_info(sctx
->screen
, tex
, log
);
628 u_log_printf(log
, "\n");
632 tex
= (struct si_texture
*)state
->zsbuf
->texture
;
633 u_log_printf(log
, COLOR_YELLOW
"Depth-stencil buffer:" COLOR_RESET
"\n");
634 si_print_texture_info(sctx
->screen
, tex
, log
);
635 u_log_printf(log
, "\n");
639 typedef unsigned (*slot_remap_func
)(unsigned);
641 struct si_log_chunk_desc_list
{
642 /** Pointer to memory map of buffer where the list is uploader */
644 /** Reference of buffer where the list is uploaded, so that gpu_list
646 struct si_resource
*buf
;
648 const char *shader_name
;
649 const char *elem_name
;
650 slot_remap_func slot_remap
;
651 enum chip_class chip_class
;
652 unsigned element_dw_size
;
653 unsigned num_elements
;
658 static void si_log_chunk_desc_list_destroy(void *data
)
660 struct si_log_chunk_desc_list
*chunk
= data
;
661 si_resource_reference(&chunk
->buf
, NULL
);
665 static void si_log_chunk_desc_list_print(void *data
, FILE *f
)
667 struct si_log_chunk_desc_list
*chunk
= data
;
668 unsigned sq_img_rsrc_word0
=
669 chunk
->chip_class
>= GFX10
? R_00A000_SQ_IMG_RSRC_WORD0
: R_008F10_SQ_IMG_RSRC_WORD0
;
671 for (unsigned i
= 0; i
< chunk
->num_elements
; i
++) {
672 unsigned cpu_dw_offset
= i
* chunk
->element_dw_size
;
673 unsigned gpu_dw_offset
= chunk
->slot_remap(i
) * chunk
->element_dw_size
;
674 const char *list_note
= chunk
->gpu_list
? "GPU list" : "CPU list";
675 uint32_t *cpu_list
= chunk
->list
+ cpu_dw_offset
;
676 uint32_t *gpu_list
= chunk
->gpu_list
? chunk
->gpu_list
+ gpu_dw_offset
: cpu_list
;
678 fprintf(f
, COLOR_GREEN
"%s%s slot %u (%s):" COLOR_RESET
"\n", chunk
->shader_name
,
679 chunk
->elem_name
, i
, list_note
);
681 switch (chunk
->element_dw_size
) {
683 for (unsigned j
= 0; j
< 4; j
++)
684 ac_dump_reg(f
, chunk
->chip_class
, R_008F00_SQ_BUF_RSRC_WORD0
+ j
* 4, gpu_list
[j
],
688 for (unsigned j
= 0; j
< 8; j
++)
689 ac_dump_reg(f
, chunk
->chip_class
, sq_img_rsrc_word0
+ j
* 4, gpu_list
[j
], 0xffffffff);
691 fprintf(f
, COLOR_CYAN
" Buffer:" COLOR_RESET
"\n");
692 for (unsigned j
= 0; j
< 4; j
++)
693 ac_dump_reg(f
, chunk
->chip_class
, R_008F00_SQ_BUF_RSRC_WORD0
+ j
* 4, gpu_list
[4 + j
],
697 for (unsigned j
= 0; j
< 8; j
++)
698 ac_dump_reg(f
, chunk
->chip_class
, sq_img_rsrc_word0
+ j
* 4, gpu_list
[j
], 0xffffffff);
700 fprintf(f
, COLOR_CYAN
" Buffer:" COLOR_RESET
"\n");
701 for (unsigned j
= 0; j
< 4; j
++)
702 ac_dump_reg(f
, chunk
->chip_class
, R_008F00_SQ_BUF_RSRC_WORD0
+ j
* 4, gpu_list
[4 + j
],
705 fprintf(f
, COLOR_CYAN
" FMASK:" COLOR_RESET
"\n");
706 for (unsigned j
= 0; j
< 8; j
++)
707 ac_dump_reg(f
, chunk
->chip_class
, sq_img_rsrc_word0
+ j
* 4, gpu_list
[8 + j
],
710 fprintf(f
, COLOR_CYAN
" Sampler state:" COLOR_RESET
"\n");
711 for (unsigned j
= 0; j
< 4; j
++)
712 ac_dump_reg(f
, chunk
->chip_class
, R_008F30_SQ_IMG_SAMP_WORD0
+ j
* 4, gpu_list
[12 + j
],
717 if (memcmp(gpu_list
, cpu_list
, chunk
->element_dw_size
* 4) != 0) {
718 fprintf(f
, COLOR_RED
"!!!!! This slot was corrupted in GPU memory !!!!!" COLOR_RESET
"\n");
725 static const struct u_log_chunk_type si_log_chunk_type_descriptor_list
= {
726 .destroy
= si_log_chunk_desc_list_destroy
,
727 .print
= si_log_chunk_desc_list_print
,
730 static void si_dump_descriptor_list(struct si_screen
*screen
, struct si_descriptors
*desc
,
731 const char *shader_name
, const char *elem_name
,
732 unsigned element_dw_size
, unsigned num_elements
,
733 slot_remap_func slot_remap
, struct u_log_context
*log
)
738 /* In some cases, the caller doesn't know how many elements are really
739 * uploaded. Reduce num_elements to fit in the range of active slots. */
740 unsigned active_range_dw_begin
= desc
->first_active_slot
* desc
->element_dw_size
;
741 unsigned active_range_dw_end
=
742 active_range_dw_begin
+ desc
->num_active_slots
* desc
->element_dw_size
;
744 while (num_elements
> 0) {
745 int i
= slot_remap(num_elements
- 1);
746 unsigned dw_begin
= i
* element_dw_size
;
747 unsigned dw_end
= dw_begin
+ element_dw_size
;
749 if (dw_begin
>= active_range_dw_begin
&& dw_end
<= active_range_dw_end
)
755 struct si_log_chunk_desc_list
*chunk
=
756 CALLOC_VARIANT_LENGTH_STRUCT(si_log_chunk_desc_list
, 4 * element_dw_size
* num_elements
);
757 chunk
->shader_name
= shader_name
;
758 chunk
->elem_name
= elem_name
;
759 chunk
->element_dw_size
= element_dw_size
;
760 chunk
->num_elements
= num_elements
;
761 chunk
->slot_remap
= slot_remap
;
762 chunk
->chip_class
= screen
->info
.chip_class
;
764 si_resource_reference(&chunk
->buf
, desc
->buffer
);
765 chunk
->gpu_list
= desc
->gpu_list
;
767 for (unsigned i
= 0; i
< num_elements
; ++i
) {
768 memcpy(&chunk
->list
[i
* element_dw_size
], &desc
->list
[slot_remap(i
) * element_dw_size
],
769 4 * element_dw_size
);
772 u_log_chunk(log
, &si_log_chunk_type_descriptor_list
, chunk
);
/* Identity slot remap: used where descriptor slots are not remapped. */
static unsigned si_identity(unsigned slot)
{
   return slot;
}
780 static void si_dump_descriptors(struct si_context
*sctx
, enum pipe_shader_type processor
,
781 const struct si_shader_info
*info
, struct u_log_context
*log
)
783 struct si_descriptors
*descs
=
784 &sctx
->descriptors
[SI_DESCS_FIRST_SHADER
+ processor
* SI_NUM_SHADER_DESCS
];
785 static const char *shader_name
[] = {"VS", "PS", "GS", "TCS", "TES", "CS"};
786 const char *name
= shader_name
[processor
];
787 unsigned enabled_constbuf
, enabled_shaderbuf
, enabled_samplers
;
788 unsigned enabled_images
;
791 enabled_constbuf
= info
->const_buffers_declared
;
792 enabled_shaderbuf
= info
->shader_buffers_declared
;
793 enabled_samplers
= info
->samplers_declared
;
794 enabled_images
= info
->images_declared
;
797 sctx
->const_and_shader_buffers
[processor
].enabled_mask
>> SI_NUM_SHADER_BUFFERS
;
798 enabled_shaderbuf
= sctx
->const_and_shader_buffers
[processor
].enabled_mask
&
799 u_bit_consecutive64(0, SI_NUM_SHADER_BUFFERS
);
800 enabled_shaderbuf
= 0;
801 for (int i
= 0; i
< SI_NUM_SHADER_BUFFERS
; i
++) {
803 (sctx
->const_and_shader_buffers
[processor
].enabled_mask
&
804 1llu << (SI_NUM_SHADER_BUFFERS
- i
- 1)) << i
;
806 enabled_samplers
= sctx
->samplers
[processor
].enabled_mask
;
807 enabled_images
= sctx
->images
[processor
].enabled_mask
;
810 if (processor
== PIPE_SHADER_VERTEX
&& sctx
->vb_descriptors_buffer
&&
811 sctx
->vb_descriptors_gpu_list
&& sctx
->vertex_elements
) {
812 assert(info
); /* only CS may not have an info struct */
813 struct si_descriptors desc
= {};
815 desc
.buffer
= sctx
->vb_descriptors_buffer
;
816 desc
.list
= sctx
->vb_descriptors_gpu_list
;
817 desc
.gpu_list
= sctx
->vb_descriptors_gpu_list
;
818 desc
.element_dw_size
= 4;
819 desc
.num_active_slots
= sctx
->vertex_elements
->vb_desc_list_alloc_size
/ 16;
821 si_dump_descriptor_list(sctx
->screen
, &desc
, name
, " - Vertex buffer", 4, info
->num_inputs
,
825 si_dump_descriptor_list(sctx
->screen
, &descs
[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS
], name
,
826 " - Constant buffer", 4, util_last_bit(enabled_constbuf
),
827 si_get_constbuf_slot
, log
);
828 si_dump_descriptor_list(sctx
->screen
, &descs
[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS
], name
,
829 " - Shader buffer", 4, util_last_bit(enabled_shaderbuf
),
830 si_get_shaderbuf_slot
, log
);
831 si_dump_descriptor_list(sctx
->screen
, &descs
[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES
], name
,
832 " - Sampler", 16, util_last_bit(enabled_samplers
), si_get_sampler_slot
,
834 si_dump_descriptor_list(sctx
->screen
, &descs
[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES
], name
,
835 " - Image", 8, util_last_bit(enabled_images
), si_get_image_slot
, log
);
838 static void si_dump_gfx_descriptors(struct si_context
*sctx
,
839 const struct si_shader_ctx_state
*state
,
840 struct u_log_context
*log
)
842 if (!state
->cso
|| !state
->current
)
845 si_dump_descriptors(sctx
, state
->cso
->type
, &state
->cso
->info
, log
);
848 static void si_dump_compute_descriptors(struct si_context
*sctx
, struct u_log_context
*log
)
850 if (!sctx
->cs_shader_state
.program
)
853 si_dump_descriptors(sctx
, PIPE_SHADER_COMPUTE
, NULL
, log
);
/* One disassembled instruction, sliced out of the .AMDGPU.disasm text. */
struct si_shader_inst {
   const char *text; /* start of disassembly for this instruction */
   /* NOTE(review): field inferred from inst->textlen uses below — length
    * of the disassembly text for this instruction. */
   unsigned textlen;
   unsigned size;  /* instruction size = 4 or 8 */
   uint64_t addr;  /* instruction address */
};
864 * Open the given \p binary as \p rtld_binary and split the contained
865 * disassembly string into instructions and add them to the array
866 * pointed to by \p instructions, which must be sufficiently large.
868 * Labels are considered to be part of the following instruction.
870 * The caller must keep \p rtld_binary alive as long as \p instructions are
871 * used and then close it afterwards.
873 static void si_add_split_disasm(struct si_screen
*screen
, struct ac_rtld_binary
*rtld_binary
,
874 struct si_shader_binary
*binary
, uint64_t *addr
, unsigned *num
,
875 struct si_shader_inst
*instructions
,
876 gl_shader_stage stage
, unsigned wave_size
)
878 if (!ac_rtld_open(rtld_binary
, (struct ac_rtld_open_info
){
879 .info
= &screen
->info
,
880 .shader_type
= stage
,
881 .wave_size
= wave_size
,
883 .elf_ptrs
= &binary
->elf_buffer
,
884 .elf_sizes
= &binary
->elf_size
}))
889 if (!ac_rtld_get_section_by_name(rtld_binary
, ".AMDGPU.disasm", &disasm
, &nbytes
))
892 const char *end
= disasm
+ nbytes
;
893 while (disasm
< end
) {
894 const char *semicolon
= memchr(disasm
, ';', end
- disasm
);
898 struct si_shader_inst
*inst
= &instructions
[(*num
)++];
899 const char *inst_end
= memchr(semicolon
+ 1, '\n', end
- semicolon
- 1);
904 inst
->textlen
= inst_end
- disasm
;
907 /* More than 16 chars after ";" means the instruction is 8 bytes long. */
908 inst
->size
= inst_end
- semicolon
> 16 ? 8 : 4;
913 disasm
= inst_end
+ 1;
917 /* If the shader is being executed, print its asm instructions, and annotate
918 * those that are being executed right now with information about waves that
919 * execute them. This is most useful during a GPU hang.
921 static void si_print_annotated_shader(struct si_shader
*shader
, struct ac_wave_info
*waves
,
922 unsigned num_waves
, FILE *f
)
927 struct si_screen
*screen
= shader
->selector
->screen
;
928 gl_shader_stage stage
= shader
->selector
->info
.stage
;
929 uint64_t start_addr
= shader
->bo
->gpu_address
;
930 uint64_t end_addr
= start_addr
+ shader
->bo
->b
.b
.width0
;
933 /* See if any wave executes the shader. */
934 for (i
= 0; i
< num_waves
; i
++) {
935 if (start_addr
<= waves
[i
].pc
&& waves
[i
].pc
<= end_addr
)
939 return; /* the shader is not being executed */
941 /* Remember the first found wave. The waves are sorted according to PC. */
945 /* Get the list of instructions.
946 * Buffer size / 4 is the upper bound of the instruction count.
948 unsigned num_inst
= 0;
949 uint64_t inst_addr
= start_addr
;
950 unsigned wave_size
= si_get_shader_wave_size(shader
);
951 struct ac_rtld_binary rtld_binaries
[5] = {};
952 struct si_shader_inst
*instructions
=
953 calloc(shader
->bo
->b
.b
.width0
/ 4, sizeof(struct si_shader_inst
));
955 if (shader
->prolog
) {
956 si_add_split_disasm(screen
, &rtld_binaries
[0], &shader
->prolog
->binary
, &inst_addr
, &num_inst
,
957 instructions
, stage
, wave_size
);
959 if (shader
->previous_stage
) {
960 si_add_split_disasm(screen
, &rtld_binaries
[1], &shader
->previous_stage
->binary
, &inst_addr
,
961 &num_inst
, instructions
, stage
, wave_size
);
963 if (shader
->prolog2
) {
964 si_add_split_disasm(screen
, &rtld_binaries
[2], &shader
->prolog2
->binary
, &inst_addr
,
965 &num_inst
, instructions
, stage
, wave_size
);
967 si_add_split_disasm(screen
, &rtld_binaries
[3], &shader
->binary
, &inst_addr
, &num_inst
,
968 instructions
, stage
, wave_size
);
969 if (shader
->epilog
) {
970 si_add_split_disasm(screen
, &rtld_binaries
[4], &shader
->epilog
->binary
, &inst_addr
, &num_inst
,
971 instructions
, stage
, wave_size
);
974 fprintf(f
, COLOR_YELLOW
"%s - annotated disassembly:" COLOR_RESET
"\n",
975 si_get_shader_name(shader
));
977 /* Print instructions with annotations. */
978 for (i
= 0; i
< num_inst
; i
++) {
979 struct si_shader_inst
*inst
= &instructions
[i
];
981 fprintf(f
, "%.*s [PC=0x%" PRIx64
", size=%u]\n", inst
->textlen
, inst
->text
, inst
->addr
,
984 /* Print which waves execute the instruction right now. */
985 while (num_waves
&& inst
->addr
== waves
->pc
) {
987 " " COLOR_GREEN
"^ SE%u SH%u CU%u "
988 "SIMD%u WAVE%u EXEC=%016" PRIx64
" ",
989 waves
->se
, waves
->sh
, waves
->cu
, waves
->simd
, waves
->wave
, waves
->exec
);
991 if (inst
->size
== 4) {
992 fprintf(f
, "INST32=%08X" COLOR_RESET
"\n", waves
->inst_dw0
);
994 fprintf(f
, "INST64=%08X %08X" COLOR_RESET
"\n", waves
->inst_dw0
, waves
->inst_dw1
);
997 waves
->matched
= true;
1005 for (unsigned i
= 0; i
< ARRAY_SIZE(rtld_binaries
); ++i
)
1006 ac_rtld_close(&rtld_binaries
[i
]);
1009 static void si_dump_annotated_shaders(struct si_context
*sctx
, FILE *f
)
1011 struct ac_wave_info waves
[AC_MAX_WAVES_PER_CHIP
];
1012 unsigned num_waves
= ac_get_wave_info(sctx
->chip_class
, waves
);
1014 fprintf(f
, COLOR_CYAN
"The number of active waves = %u" COLOR_RESET
"\n\n", num_waves
);
1016 si_print_annotated_shader(sctx
->vs_shader
.current
, waves
, num_waves
, f
);
1017 si_print_annotated_shader(sctx
->tcs_shader
.current
, waves
, num_waves
, f
);
1018 si_print_annotated_shader(sctx
->tes_shader
.current
, waves
, num_waves
, f
);
1019 si_print_annotated_shader(sctx
->gs_shader
.current
, waves
, num_waves
, f
);
1020 si_print_annotated_shader(sctx
->ps_shader
.current
, waves
, num_waves
, f
);
1022 /* Print waves executing shaders that are not currently bound. */
1025 for (i
= 0; i
< num_waves
; i
++) {
1026 if (waves
[i
].matched
)
1030 fprintf(f
, COLOR_CYAN
"Waves not executing currently-bound shaders:" COLOR_RESET
"\n");
1034 " SE%u SH%u CU%u SIMD%u WAVE%u EXEC=%016" PRIx64
" INST=%08X %08X PC=%" PRIx64
1036 waves
[i
].se
, waves
[i
].sh
, waves
[i
].cu
, waves
[i
].simd
, waves
[i
].wave
, waves
[i
].exec
,
1037 waves
[i
].inst_dw0
, waves
[i
].inst_dw1
, waves
[i
].pc
);
1043 static void si_dump_command(const char *title
, const char *command
, FILE *f
)
1047 FILE *p
= popen(command
, "r");
1051 fprintf(f
, COLOR_YELLOW
"%s: " COLOR_RESET
"\n", title
);
1052 while (fgets(line
, sizeof(line
), p
))
1058 static void si_dump_debug_state(struct pipe_context
*ctx
, FILE *f
, unsigned flags
)
1060 struct si_context
*sctx
= (struct si_context
*)ctx
;
1063 u_log_flush(sctx
->log
);
1065 if (flags
& PIPE_DUMP_DEVICE_STATUS_REGISTERS
) {
1066 si_dump_debug_registers(sctx
, f
);
1068 si_dump_annotated_shaders(sctx
, f
);
1069 si_dump_command("Active waves (raw data)", "umr -O halt_waves -wa | column -t", f
);
1070 si_dump_command("Wave information", "umr -O halt_waves,bits -wa", f
);
1074 void si_log_draw_state(struct si_context
*sctx
, struct u_log_context
*log
)
1076 struct si_shader_ctx_state
*tcs_shader
;
1081 tcs_shader
= &sctx
->tcs_shader
;
1082 if (sctx
->tes_shader
.cso
&& !sctx
->tcs_shader
.cso
)
1083 tcs_shader
= &sctx
->fixed_func_tcs_shader
;
1085 si_dump_framebuffer(sctx
, log
);
1087 si_dump_gfx_shader(sctx
, &sctx
->vs_shader
, log
);
1088 si_dump_gfx_shader(sctx
, tcs_shader
, log
);
1089 si_dump_gfx_shader(sctx
, &sctx
->tes_shader
, log
);
1090 si_dump_gfx_shader(sctx
, &sctx
->gs_shader
, log
);
1091 si_dump_gfx_shader(sctx
, &sctx
->ps_shader
, log
);
1093 si_dump_descriptor_list(sctx
->screen
, &sctx
->descriptors
[SI_DESCS_RW_BUFFERS
], "", "RW buffers",
1094 4, sctx
->descriptors
[SI_DESCS_RW_BUFFERS
].num_active_slots
, si_identity
,
1096 si_dump_gfx_descriptors(sctx
, &sctx
->vs_shader
, log
);
1097 si_dump_gfx_descriptors(sctx
, tcs_shader
, log
);
1098 si_dump_gfx_descriptors(sctx
, &sctx
->tes_shader
, log
);
1099 si_dump_gfx_descriptors(sctx
, &sctx
->gs_shader
, log
);
1100 si_dump_gfx_descriptors(sctx
, &sctx
->ps_shader
, log
);
/* Log everything relevant to the current compute dispatch: the bound
 * compute shader and its descriptor lists. */
void si_log_compute_state(struct si_context *sctx, struct u_log_context *log)
{
   if (!log)
      return;

   si_dump_compute_shader(sctx, log);
   si_dump_compute_descriptors(sctx, log);
}
1112 static void si_dump_dma(struct si_context
*sctx
, struct radeon_saved_cs
*saved
, FILE *f
)
1114 static const char ib_name
[] = "sDMA IB";
1117 si_dump_bo_list(sctx
, saved
, f
);
1119 fprintf(f
, "------------------ %s begin ------------------\n", ib_name
);
1121 for (i
= 0; i
< saved
->num_dw
; ++i
) {
1122 fprintf(f
, " %08x\n", saved
->ib
[i
]);
1125 fprintf(f
, "------------------- %s end -------------------\n", ib_name
);
1128 fprintf(f
, "SDMA Dump Done.\n");
1131 void si_check_vm_faults(struct si_context
*sctx
, struct radeon_saved_cs
*saved
, enum ring_type ring
)
1133 struct pipe_screen
*screen
= sctx
->b
.screen
;
1136 char cmd_line
[4096];
1138 if (!ac_vm_fault_occured(sctx
->chip_class
, &sctx
->dmesg_timestamp
, &addr
))
1141 f
= dd_get_debug_file(false);
1145 fprintf(f
, "VM fault report.\n\n");
1146 if (os_get_command_line(cmd_line
, sizeof(cmd_line
)))
1147 fprintf(f
, "Command: %s\n", cmd_line
);
1148 fprintf(f
, "Driver vendor: %s\n", screen
->get_vendor(screen
));
1149 fprintf(f
, "Device vendor: %s\n", screen
->get_device_vendor(screen
));
1150 fprintf(f
, "Device name: %s\n\n", screen
->get_name(screen
));
1151 fprintf(f
, "Failing VM page: 0x%08" PRIx64
"\n\n", addr
);
1153 if (sctx
->apitrace_call_number
)
1154 fprintf(f
, "Last apitrace call: %u\n\n", sctx
->apitrace_call_number
);
1158 struct u_log_context log
;
1159 u_log_context_init(&log
);
1161 si_log_draw_state(sctx
, &log
);
1162 si_log_compute_state(sctx
, &log
);
1163 si_log_cs(sctx
, &log
, true);
1165 u_log_new_page_print(&log
, f
);
1166 u_log_context_destroy(&log
);
1170 si_dump_dma(sctx
, saved
, f
);
1179 fprintf(stderr
, "Detected a VM fault, exiting...\n");
1183 void si_init_debug_functions(struct si_context
*sctx
)
1185 sctx
->b
.dump_debug_state
= si_dump_debug_state
;
1187 /* Set the initial dmesg timestamp for this context, so that
1188 * only new messages will be checked for VM faults.
1190 if (sctx
->screen
->debug_flags
& DBG(CHECK_VM
))
1191 ac_vm_fault_occured(sctx
->chip_class
, &sctx
->dmesg_timestamp
, NULL
);