From 24c8bbbb758a1b6019829c6d12db9d64dd2007e9 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 27 Sep 2017 15:27:31 -0700 Subject: [PATCH] broadcom/vc5: Use supertiles and generic tile lists. This massively reduces the size of our RCL setup. It also gets us closer to supporting multicore platforms. --- src/gallium/drivers/vc5/vc5_cl.h | 5 + src/gallium/drivers/vc5/vc5_rcl.c | 195 ++++++++++++++++--------- src/gallium/drivers/vc5/vc5_uniforms.c | 3 +- 3 files changed, 130 insertions(+), 73 deletions(-) diff --git a/src/gallium/drivers/vc5/vc5_cl.h b/src/gallium/drivers/vc5/vc5_cl.h index e935eeff536..64ccac80593 100644 --- a/src/gallium/drivers/vc5/vc5_cl.h +++ b/src/gallium/drivers/vc5/vc5_cl.h @@ -74,6 +74,11 @@ static inline uint32_t cl_offset(struct vc5_cl *cl) return (char *)cl->next - (char *)cl->base; } +static inline struct vc5_cl_reloc cl_get_address(struct vc5_cl *cl) +{ + return (struct vc5_cl_reloc){ .bo = cl->bo, .offset = cl_offset(cl) }; +} + static inline void cl_advance(struct vc5_cl_out **cl, uint32_t n) { diff --git a/src/gallium/drivers/vc5/vc5_rcl.c b/src/gallium/drivers/vc5/vc5_rcl.c index 287a35aa33e..e55a29772e0 100644 --- a/src/gallium/drivers/vc5/vc5_rcl.c +++ b/src/gallium/drivers/vc5/vc5_rcl.c @@ -26,23 +26,80 @@ #include "vc5_tiling.h" #include "broadcom/cle/v3d_packet_v33_pack.h" +static void +vc5_rcl_emit_generic_per_tile_list(struct vc5_job *job) +{ + /* Emit the generic list in our indirect state -- the rcl will just + * have pointers into it. + */ + struct vc5_cl *cl = &job->indirect; + vc5_cl_ensure_space(cl, 200, 1); + struct vc5_cl_reloc tile_list_start = cl_get_address(cl); + + const uint32_t pipe_clear_color_buffers = (PIPE_CLEAR_COLOR0 | + PIPE_CLEAR_COLOR1 | + PIPE_CLEAR_COLOR2 | + PIPE_CLEAR_COLOR3); + const uint32_t first_color_buffer_bit = (ffs(PIPE_CLEAR_COLOR0) - 1); + + uint32_t read_but_not_cleared = job->resolve & ~job->cleared; + + /* The initial reload will be queued until we get the + * tile coordinates. + */ + if (read_but_not_cleared) { + cl_emit(cl, RELOAD_TILE_COLOUR_BUFFER, load) { + load.disable_colour_buffer_load = + (~read_but_not_cleared & pipe_clear_color_buffers) >> + first_color_buffer_bit; + load.enable_z_load = + read_but_not_cleared & PIPE_CLEAR_DEPTH; + load.enable_stencil_load = + read_but_not_cleared & PIPE_CLEAR_STENCIL; + } + } + + /* Tile Coordinates triggers the reload and sets where the stores + * go. There must be one per store packet. + */ + cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); + + cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); + + cl_emit(cl, STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED, store) { + uint32_t color_write_enables = + job->resolve >> first_color_buffer_bit; + + store.disable_color_buffer_write = (~color_write_enables) & 0xf; + store.enable_z_write = job->resolve & PIPE_CLEAR_DEPTH; + store.enable_stencil_write = job->resolve & PIPE_CLEAR_STENCIL; + + store.disable_colour_buffers_clear_on_write = + (job->cleared & pipe_clear_color_buffers) == 0; + store.disable_z_buffer_clear_on_write = + !(job->cleared & PIPE_CLEAR_DEPTH); + store.disable_stencil_buffer_clear_on_write = + !(job->cleared & PIPE_CLEAR_STENCIL); + }; + + cl_emit(cl, RETURN_FROM_SUB_LIST, ret); + + cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { + branch.start = tile_list_start; + branch.end = cl_get_address(cl); + } +} + +#define div_round_up(a, b) (((a) + (b) - 1) / b) + void vc5_emit_rcl(struct vc5_job *job) { - uint32_t min_x_tile = job->draw_min_x / job->tile_width; - uint32_t min_y_tile = job->draw_min_y / job->tile_height; - uint32_t max_x_tile = (job->draw_max_x - 1) / job->tile_width; - uint32_t max_y_tile = (job->draw_max_y - 1) / job->tile_height; - /* The RCL list should be empty. */ assert(!job->rcl.bo); - vc5_cl_ensure_space(&job->rcl, - 256 + - (64 * - (max_x_tile - min_x_tile + 1) * - (max_y_tile - min_y_tile + 1)), 1); - + vc5_cl_ensure_space_with_branch(&job->rcl, 200 + 256 * + cl_packet_length(SUPERTILE_COORDINATES)); job->submit.rcl_start = job->rcl.bo->offset; vc5_job_add_bo(job, job->rcl.bo); @@ -137,7 +194,45 @@ vc5_emit_rcl(struct vc5_job *job) TILE_ALLOCATION_BLOCK_SIZE_64B; } - cl_emit(&job->rcl, WAIT_ON_SEMAPHORE, sem); + uint32_t supertile_w = 1, supertile_h = 1; + + /* If doing multicore binning, we would need to initialize each core's + * tile list here. + */ + cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) { + list.address = cl_address(job->tile_alloc, 0); + } + + cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CONFIGURATION, config) { + uint32_t frame_w_in_supertiles, frame_h_in_supertiles; + const uint32_t max_supertiles = 256; + + /* Size up our supertiles until we get under the limit. */ + for (;;) { + frame_w_in_supertiles = div_round_up(job->draw_tiles_x, + supertile_w); + frame_h_in_supertiles = div_round_up(job->draw_tiles_y, + supertile_h); + if (frame_w_in_supertiles * frame_h_in_supertiles < + max_supertiles) { + break; + } + + if (supertile_w < supertile_h) + supertile_w++; + else + supertile_h++; + } + + config.total_frame_width_in_tiles = job->draw_tiles_x; + config.total_frame_height_in_tiles = job->draw_tiles_y; + + config.supertile_width_in_tiles_minus_1 = supertile_w - 1; + config.supertile_height_in_tiles_minus_1 = supertile_h - 1; + + config.total_frame_width_in_supertiles = frame_w_in_supertiles; + config.total_frame_height_in_supertiles = frame_h_in_supertiles; + } /* Start by clearing the tile buffer. */ cl_emit(&job->rcl, TILE_COORDINATES, coords) { @@ -151,68 +246,26 @@ vc5_emit_rcl(struct vc5_job *job) cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush); - const uint32_t pipe_clear_color_buffers = (PIPE_CLEAR_COLOR0 | - PIPE_CLEAR_COLOR1 | - PIPE_CLEAR_COLOR2 | - PIPE_CLEAR_COLOR3); - const uint32_t first_color_buffer_bit = (ffs(PIPE_CLEAR_COLOR0) - 1); + vc5_rcl_emit_generic_per_tile_list(job); - for (int y = min_y_tile; y <= max_y_tile; y++) { - for (int x = min_x_tile; x <= max_x_tile; x++) { - uint32_t read_but_not_cleared = job->resolve & ~job->cleared; - - /* The initial reload will be queued until we get the - * tile coordinates. - */ - if (read_but_not_cleared) { - cl_emit(&job->rcl, RELOAD_TILE_COLOUR_BUFFER, load) { - load.disable_colour_buffer_load = - (~read_but_not_cleared & pipe_clear_color_buffers) >> - first_color_buffer_bit; - load.enable_z_load = - read_but_not_cleared & PIPE_CLEAR_DEPTH; - load.enable_stencil_load = - read_but_not_cleared & PIPE_CLEAR_STENCIL; - } - } - - /* Tile Coordinates triggers the reload and sets where - * the stores go. There must be one per store packet. - */ - cl_emit(&job->rcl, TILE_COORDINATES, coords) { - coords.tile_column_number = x; - coords.tile_row_number = y; - } + cl_emit(&job->rcl, WAIT_ON_SEMAPHORE, sem); - cl_emit(&job->rcl, BRANCH_TO_AUTO_CHAINED_SUB_LIST, branch) { - uint32_t bin_tile_stride = - (align(job->draw_width, - job->tile_width) / - job->tile_width); - uint32_t bin_index = - (y * bin_tile_stride + x); - branch.address = cl_address(job->tile_alloc, - 64 * bin_index); + /* XXX: Use Morton order */ + uint32_t supertile_w_in_pixels = job->tile_width * supertile_w; + uint32_t supertile_h_in_pixels = job->tile_height * supertile_h; + uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels; + uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels; + uint32_t max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels; + uint32_t max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels; + + for (int y = min_y_supertile; y <= max_y_supertile; y++) { + for (int x = min_x_supertile; x <= max_x_supertile; x++) { + cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) { + coords.column_number_in_supertiles = x; + coords.row_number_in_supertiles = y; } - - cl_emit(&job->rcl, STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED, store) { - uint32_t color_write_enables = - job->resolve >> first_color_buffer_bit; - - store.disable_color_buffer_write = (~color_write_enables) & 0xf; - store.enable_z_write = job->resolve & PIPE_CLEAR_DEPTH; - store.enable_stencil_write = job->resolve & PIPE_CLEAR_STENCIL; - - store.disable_colour_buffers_clear_on_write = - (job->cleared & pipe_clear_color_buffers) == 0; - store.disable_z_buffer_clear_on_write = - !(job->cleared & PIPE_CLEAR_DEPTH); - store.disable_stencil_buffer_clear_on_write = - !(job->cleared & PIPE_CLEAR_STENCIL); - - store.last_tile_of_frame = (x == max_x_tile && - y == max_y_tile); - }; } } + + cl_emit(&job->rcl, END_OF_RENDERING, end); } diff --git a/src/gallium/drivers/vc5/vc5_uniforms.c b/src/gallium/drivers/vc5/vc5_uniforms.c index dc444fe92a4..0c8bee51784 100644 --- a/src/gallium/drivers/vc5/vc5_uniforms.c +++ b/src/gallium/drivers/vc5/vc5_uniforms.c @@ -225,8 +225,7 @@ vc5_write_uniforms(struct vc5_context *vc5, struct vc5_compiled_shader *shader, */ vc5_cl_ensure_space(&job->indirect, MAX2(uinfo->count, 1) * 4, 4); - struct vc5_cl_reloc uniform_stream = - cl_address(job->indirect.bo, cl_offset(&job->indirect)); + struct vc5_cl_reloc uniform_stream = cl_get_address(&job->indirect); vc5_bo_reference(uniform_stream.bo); struct vc5_cl_out *uniforms = -- 2.30.2