From 85e745f2b4e77b0a580ea8f05f67c889910467a4 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 12 Jun 2019 09:33:06 -0700 Subject: [PATCH] panfrost: Integrate kernel names for tiler FBD These names are from the replay workaround in kbase; they begin to shine some light on the meaning of these fields. In particular, we now understand why the "tiler_meta" field has the effect it does on performance in certain scenes (controlling tile granularity). Signed-off-by: Alyssa Rosenzweig --- .../drivers/panfrost/include/panfrost-job.h | 34 +++++++++----- src/gallium/drivers/panfrost/pan_context.c | 44 ++++++++----------- .../drivers/panfrost/pandecode/decode.c | 36 ++++++++++----- 3 files changed, 66 insertions(+), 48 deletions(-) diff --git a/src/gallium/drivers/panfrost/include/panfrost-job.h b/src/gallium/drivers/panfrost/include/panfrost-job.h index fd23499a00c..401fef8fcec 100644 --- a/src/gallium/drivers/panfrost/include/panfrost-job.h +++ b/src/gallium/drivers/panfrost/include/panfrost-job.h @@ -2,6 +2,7 @@ * © Copyright 2017-2018 Alyssa Rosenzweig * © Copyright 2017-2018 Connor Abbott * © Copyright 2017-2018 Lyude Paul + * © Copyright 2019 Collabora * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -1362,16 +1363,16 @@ struct mali_single_framebuffer { u32 zero6[7]; /* Very weird format, see generation code in trans_builder.c */ - u32 resolution_check; - + u32 tiler_resolution_check; u32 tiler_flags; - u64 unknown_address_1; /* Pointing towards... a zero buffer? */ - u64 unknown_address_2; + /* Guesses? */ + mali_ptr tiler_scratch_start; /* Pointing towards... a zero buffer? 
*/ + mali_ptr tiler_scratch_middle; /* See mali_kbase_replay.c */ - u64 tiler_heap_free; - u64 tiler_heap_end; + mali_ptr tiler_heap_free; + mali_ptr tiler_heap_end; /* More below this, maybe */ } __attribute__((packed)); @@ -1519,18 +1520,29 @@ struct bifrost_framebuffer { u32 clear_stencil : 8; u32 unk3 : 24; // = 0x100 float clear_depth; - mali_ptr tiler_meta; - /* 0x40 */ + + + /* Tiler section begins here */ + u32 tiler_unknown; + + /* Name known from the replay workaround in the kernel. What exactly is + * flagged here is less known. We do know that (tiler_flags & 0x1ff) + * specifies a mask of hierarchy weights, which explains some of the + * performance mysteries around setting it. We also know (1 << 16) + * should be set, but there's no explanation in the kernel why. */ + u32 tiler_flags; /* Note: these are guesses! */ mali_ptr tiler_scratch_start; mali_ptr tiler_scratch_middle; - /* These are not, since we see symmetry with replay jobs which name these explicitly */ - mali_ptr tiler_heap_start; + /* These are not, since we see symmetry with replay + * jobs which name these explicitly */ + + mali_ptr tiler_heap_start; /* tiler heap_free_address */ mali_ptr tiler_heap_end; - u64 zero9, zero10, zero11, zero12; + u32 tiler_weights[8]; /* optional: struct bifrost_fb_extra extra */ /* struct bifrost_render_target rts[] */ diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c index 2f7ab8d5316..2a530872c02 100644 --- a/src/gallium/drivers/panfrost/pan_context.c +++ b/src/gallium/drivers/panfrost/pan_context.c @@ -107,7 +107,7 @@ panfrost_set_framebuffer_resolution(struct mali_single_framebuffer *fb, int w, i * The formula itself was discovered mostly by manual bruteforce and * aggressive algebraic simplification. 
*/ - fb->resolution_check = ((w + h) / 3) << 4; + fb->tiler_resolution_check = ((w + h) / 3) << 4; } struct mali_single_framebuffer @@ -118,8 +118,8 @@ panfrost_emit_sfbd(struct panfrost_context *ctx) .format = 0x30000000, .clear_flags = 0x1000, .unknown_address_0 = ctx->scratchpad.gpu, - .unknown_address_1 = ctx->misc_0.gpu, - .unknown_address_2 = ctx->misc_0.gpu + 40960, + .tiler_scratch_start = ctx->misc_0.gpu, + .tiler_scratch_middle = ctx->misc_0.gpu + 40960, .tiler_flags = 0xf0, .tiler_heap_free = ctx->tiler_heap.gpu, .tiler_heap_end = ctx->tiler_heap.gpu + ctx->tiler_heap.size, @@ -134,28 +134,22 @@ struct bifrost_framebuffer panfrost_emit_mfbd(struct panfrost_context *ctx) { struct bifrost_framebuffer framebuffer = { - /* It is not yet clear what tiler_meta means or how it's - * calculated, but we can tell the lower 32-bits are a - * (monotonically increasing?) function of tile count and - * geometry complexity; I suspect it defines a memory size of - * some kind? for the tiler. It's really unclear at the - * moment... but to add to the confusion, the hardware is happy - * enough to accept a zero in this field, so we don't even have - * to worry about it right now. - * - * The byte (just after the 32-bit mark) is much more - * interesting. The higher nibble I've only ever seen as 0xF, - * but the lower one I've seen as 0x0 or 0xF, and it's not - * obvious what the difference is. But what -is- obvious is - * that when the lower nibble is zero, performance is severely - * degraded compared to when the lower nibble is set. - * Evidently, that nibble enables some sort of fast path, - * perhaps relating to caching or tile flush? Regardless, at - * this point there's no clear reason not to set it, aside from - * substantially increased memory requirements (of the misc_0 - * buffer) */ - - .tiler_meta = ((uint64_t) 0xff << 32) | 0x0, + /* It is not yet clear what this means or how it's + * calculated, but we can tell it is a (monotonically + * increasing?) 
function of tile count and geometry complexity; + * I suspect it defines a memory size of some kind? for the + * tiler. It's really unclear at the moment... but to add to + * the confusion, the hardware is happy enough to accept a zero + * in this field, so we don't even have to worry about it right + * now. */ + + .tiler_unknown = 0x0, + + /* The lower 0xff controls the hierarchy mask. Set more bits + * on for more tile granularity (which can be a performance win + * on some scenes, at memory bandwidth costs). For now, be lazy + * and enable everything. This might be a terrible idea. */ + .tiler_flags = 0xff, .width1 = MALI_POSITIVE(ctx->pipe_framebuffer.width), .height1 = MALI_POSITIVE(ctx->pipe_framebuffer.height), diff --git a/src/gallium/drivers/panfrost/pandecode/decode.c b/src/gallium/drivers/panfrost/pandecode/decode.c index 05b4ed3134a..04de638a904 100644 --- a/src/gallium/drivers/panfrost/pandecode/decode.c +++ b/src/gallium/drivers/panfrost/pandecode/decode.c @@ -463,10 +463,10 @@ pandecode_replay_sfbd(uint64_t gpu_va, int job_no) } MEMORY_PROP(s, unknown_address_0); - MEMORY_PROP(s, unknown_address_1); - MEMORY_PROP(s, unknown_address_2); + MEMORY_PROP(s, tiler_scratch_start); + MEMORY_PROP(s, tiler_scratch_middle); - pandecode_prop("resolution_check = 0x%" PRIx32, s->resolution_check); + pandecode_prop("tiler_resolution_check = 0x%" PRIx32, s->tiler_resolution_check); pandecode_prop("tiler_flags = 0x%" PRIx32, s->tiler_flags); MEMORY_PROP(s, tiler_heap_free); @@ -640,12 +640,12 @@ pandecode_replay_mfbd_bfr(uint64_t gpu_va, int job_no, bool with_render_targets) if (fb->sample_locations) pandecode_prop("sample_locations = sample_locations_%d", job_no); - /* Assume that unknown1 and tiler_meta were emitted in the last job for + /* Assume that unknown1 was emitted in the last job for * now */ - /*pandecode_prop("unknown1 = unknown1_%d_p", job_no - 1); - pandecode_prop("tiler_meta = tiler_meta_%d_p", job_no - 1);*/ MEMORY_PROP(fb, unknown1); - 
MEMORY_PROP(fb, tiler_meta); + + pandecode_prop("tiler_unknown = 0x%x", fb->tiler_unknown); + pandecode_prop("tiler_flags = 0x%x", fb->tiler_flags); pandecode_prop("width1 = MALI_POSITIVE(%d)", fb->width1 + 1); pandecode_prop("height1 = MALI_POSITIVE(%d)", fb->height1 + 1); @@ -668,14 +668,26 @@ pandecode_replay_mfbd_bfr(uint64_t gpu_va, int job_no, bool with_render_targets) MEMORY_PROP(fb, tiler_heap_start); MEMORY_PROP(fb, tiler_heap_end); - if (fb->zero3 || fb->zero4 || fb->zero9 || fb->zero10 || fb->zero11 || fb->zero12) { + if (fb->zero3 || fb->zero4) { pandecode_msg("framebuffer zeros tripped\n"); pandecode_prop("zero3 = 0x%" PRIx32, fb->zero3); pandecode_prop("zero4 = 0x%" PRIx32, fb->zero4); - pandecode_prop("zero9 = 0x%" PRIx64, fb->zero9); - pandecode_prop("zero10 = 0x%" PRIx64, fb->zero10); - pandecode_prop("zero11 = 0x%" PRIx64, fb->zero11); - pandecode_prop("zero12 = 0x%" PRIx64, fb->zero12); + } + + bool nonzero_weights = false; + + for (unsigned w = 0; w < ARRAY_SIZE(fb->tiler_weights); ++w) { + nonzero_weights |= fb->tiler_weights[w] != 0x0; + } + + if (nonzero_weights) { + pandecode_log(".tiler_weights = {"); + + for (unsigned w = 0; w < ARRAY_SIZE(fb->tiler_weights); ++w) { + pandecode_log("%d, ", fb->tiler_weights[w]); + } + + pandecode_log("},"); } pandecode_indent--; -- 2.30.2