From 2f6d47a7c8d6e69e5154de44115aab9ba35a41d2 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Fri, 16 Jul 2010 14:40:30 +0100 Subject: [PATCH] llvmpipe: use single swizzled tile Use a single swizzled tile per colorbuf (and per thread) to avoid accumulating large amounts of cached swizzled data. Now that the SSE3 code has been merged to master, the performance delta of this change is minimal, the main benefit is reduced memory usage due to no longer keeping swizzled copies of render targets. It's clear from the performance of the in-place version of this code that there is still quite a bit of time being spent swizzling & unswizzling, but it's not clear exactly how to reduce that. --- src/gallium/drivers/llvmpipe/lp_memory.c | 41 +++------- src/gallium/drivers/llvmpipe/lp_memory.h | 13 +-- src/gallium/drivers/llvmpipe/lp_rast.c | 55 +++++-------- src/gallium/drivers/llvmpipe/lp_rast_priv.h | 24 +++--- src/gallium/drivers/llvmpipe/lp_setup.c | 16 +--- src/gallium/drivers/llvmpipe/lp_texture.c | 88 +++++++++++++++++++++ src/gallium/drivers/llvmpipe/lp_texture.h | 11 +++ 7 files changed, 148 insertions(+), 100 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_memory.c b/src/gallium/drivers/llvmpipe/lp_memory.c index 61d16668eb9..0f55d4a80ae 100644 --- a/src/gallium/drivers/llvmpipe/lp_memory.c +++ b/src/gallium/drivers/llvmpipe/lp_memory.c @@ -29,32 +29,17 @@ #include "lp_limits.h" #include "lp_memory.h" - -/** 32bpp RGBA dummy tile to use in out of memory conditions */ -static PIPE_ALIGN_VAR(16) uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4]; - -static unsigned lp_out_of_memory = 0; - - -uint8_t * -lp_get_dummy_tile(void) -{ - if (lp_out_of_memory++ < 10) { - debug_printf("llvmpipe: out of memory. Using dummy tile memory.\n"); - } - return lp_dummy_tile; -} - -uint8_t * -lp_get_dummy_tile_silent(void) -{ - return lp_dummy_tile; -} - - -boolean -lp_is_dummy_tile(void *tile) -{ - return tile == lp_dummy_tile; -} +/** + * 32bpp RGBA swizzled tiles. One for for each thread and each + * possible colorbuf. Adds up to quite a bit 8*8*64*64*4 == 1MB. + * Several schemes exist to reduce this, such as scaling back the + * number of threads or using a smaller tilesize when multiple + * colorbuffers are bound. + */ +PIPE_ALIGN_VAR(16) uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4]; + + +/* A single dummy tile used in a couple of out-of-memory situations. + */ +PIPE_ALIGN_VAR(16) uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4]; diff --git a/src/gallium/drivers/llvmpipe/lp_memory.h b/src/gallium/drivers/llvmpipe/lp_memory.h index 1d0e5ebdb69..f7418f5e087 100644 --- a/src/gallium/drivers/llvmpipe/lp_memory.h +++ b/src/gallium/drivers/llvmpipe/lp_memory.h @@ -30,16 +30,11 @@ #include "pipe/p_compiler.h" +#include "pipe/p_state.h" +#include "lp_limits.h" +extern PIPE_ALIGN_VAR(16) uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4]; -extern uint8_t * -lp_get_dummy_tile(void); - -uint8_t * -lp_get_dummy_tile_silent(void); - -extern boolean -lp_is_dummy_tile(void *tile); - +extern PIPE_ALIGN_VAR(16) uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4]; #endif /* LP_MEMORY_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index a023d2b668e..654f4ea48eb 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -67,7 +67,7 @@ lp_rast_begin( struct lp_rasterizer *rast, cbuf->level, cbuf->zslice, LP_TEX_USAGE_READ_WRITE, - LP_TEX_LAYOUT_NONE); + LP_TEX_LAYOUT_LINEAR); } if (fb->zsbuf) { @@ -271,11 +271,6 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, dst = task->depth_tile; - if (lp_is_dummy_tile(dst)) - return; - - assert(dst == lp_rast_get_depth_block_pointer(task, task->x, task->y)); - switch (block_size) { case 1: memset(dst, (uint8_t) clear_value, height * width); @@ -375,10 +370,15 @@ lp_rast_store_linear_color( struct lp_rasterizer_task *task, struct pipe_surface *cbuf = scene->fb.cbufs[buf]; const unsigned face = cbuf->face, level = cbuf->level; struct llvmpipe_resource *lpt = llvmpipe_resource(cbuf->texture); - /* this will convert the tiled data to linear if needed */ - (void) llvmpipe_get_texture_tile_linear(lpt, face, level, - LP_TEX_USAGE_READ, - task->x, task->y); + + if (!task->color_tiles[buf]) + continue; + + llvmpipe_unswizzle_cbuf_tile(lpt, + face, + level, + task->x, task->y, + task->color_tiles[buf]); } } @@ -589,6 +589,11 @@ lp_rast_tile_end(struct lp_rasterizer_task *task) (void) outline_subtiles; #endif + { + union lp_rast_cmd_arg dummy = {0}; + lp_rast_store_linear_color(task, dummy); + } + /* debug */ memset(task->color_tiles, 0, sizeof(task->color_tiles)); task->depth_tile = NULL; @@ -751,30 +756,8 @@ debug_bin( const struct cmd_bin *bin ) static boolean is_empty_bin( const struct cmd_bin *bin ) { - const struct cmd_block *head = bin->commands.head; - int i; - - if (0) - debug_bin(bin); - - /* We emit at most two load-tile commands at the start of the first - * command block. In addition we seem to emit a couple of - * set-state commands even in empty bins. - * - * As a heuristic, if a bin has more than 4 commands, consider it - * non-empty. - */ - if (head->next != NULL || - head->count > 4) { - return FALSE; - } - - for (i = 0; i < head->count; i++) - if (head->cmd[i] != lp_rast_store_linear_color) { - return FALSE; - } - - return TRUE; + if (0) debug_bin(bin); + return bin->commands.head->count == 0; } @@ -984,6 +967,10 @@ lp_rast_create( unsigned num_threads ) /* for synchronizing rasterization threads */ pipe_barrier_init( &rast->barrier, rast->num_threads ); + memset(lp_swizzled_cbuf, 0, sizeof lp_swizzled_cbuf); + + memset(lp_dummy_tile, 0, sizeof lp_dummy_tile); + return rast; } diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h index 8044927c8be..b4a48cfd024 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h @@ -148,7 +148,7 @@ lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task, * the oom warning as this most likely because there is no * zsbuf. */ - return lp_get_dummy_tile_silent(); + return lp_dummy_tile; } depth = (rast->zsbuf.map + @@ -178,15 +178,14 @@ lp_rast_get_color_tile_pointer(struct lp_rasterizer_task *task, struct llvmpipe_resource *lpt; assert(cbuf); lpt = llvmpipe_resource(cbuf->texture); - task->color_tiles[buf] = llvmpipe_get_texture_tile(lpt, - cbuf->face + cbuf->zslice, - cbuf->level, - usage, - task->x, - task->y); - if (!task->color_tiles[buf]) { - /* out of memory - use dummy tile memory */ - return lp_get_dummy_tile(); + task->color_tiles[buf] = lp_swizzled_cbuf[task->thread_index][buf]; + + if (usage != LP_TEX_USAGE_WRITE_ALL) { + llvmpipe_swizzle_cbuf_tile(lpt, + cbuf->face + cbuf->zslice, + cbuf->level, + task->x, task->y, + task->color_tiles[buf]); } } @@ -212,10 +211,7 @@ lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task, assert((y % TILE_VECTOR_HEIGHT) == 0); color = lp_rast_get_color_tile_pointer(task, buf, LP_TEX_USAGE_READ_WRITE); - if (!color) { - /* out of memory - use dummy tile memory */ - return lp_get_dummy_tile(); - } + assert(color); px = x % TILE_SIZE; py = y % TILE_SIZE; diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 7d48ad8e741..556e571585d 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -280,20 +280,6 @@ lp_setup_flush( struct lp_setup_context *setup, LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); if (setup->scene) { - struct lp_scene *scene = lp_setup_get_current_scene(setup); - union lp_rast_cmd_arg dummy = {0}; - - if (flags & (PIPE_FLUSH_SWAPBUFFERS | - PIPE_FLUSH_FRAME)) { - /* Store colors in the linear color buffer(s). - * If we don't do this here, we'll end up converting the tiled - * data to linear in the texture_unmap() function, which will - * not be a parallel/threaded operation as here. - */ - lp_scene_bin_everywhere(scene, lp_rast_store_linear_color, dummy); - } - - if (fence) { /* if we're going to flush the setup/rasterization modules, emit * a fence. @@ -642,7 +628,7 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup, if (!jit_tex->data[j]) { /* out of memory - use dummy tile memory */ - jit_tex->data[j] = lp_get_dummy_tile(); + jit_tex->data[j] = lp_dummy_tile; jit_tex->width = TILE_SIZE; jit_tex->height = TILE_SIZE; jit_tex->depth = 1; diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c index d236bad69d1..bbd834519a3 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.c +++ b/src/gallium/drivers/llvmpipe/lp_texture.c @@ -1208,6 +1208,94 @@ llvmpipe_get_texture_tile(struct llvmpipe_resource *lpr, } +/** + * Get pointer to tiled data for rendering. + * \return pointer to the tiled data at the given tile position + */ +void +llvmpipe_unswizzle_cbuf_tile(struct llvmpipe_resource *lpr, + unsigned face_slice, unsigned level, + unsigned x, unsigned y, + uint8_t *tile) +{ + struct llvmpipe_texture_image *linear_img = &lpr->linear[level]; + const unsigned tx = x / TILE_SIZE, ty = y / TILE_SIZE; + uint8_t *linear_image; + + assert(x % TILE_SIZE == 0); + assert(y % TILE_SIZE == 0); + + if (!linear_img->data) { + /* allocate memory for the linear image now */ + alloc_image_data(lpr, level, LP_TEX_LAYOUT_LINEAR); + } + + /* compute address of the slice/face of the image that contains the tile */ + linear_image = llvmpipe_get_texture_image_address(lpr, face_slice, level, + LP_TEX_LAYOUT_LINEAR); + + { + uint ii = x, jj = y; + uint tile_offset = jj / TILE_SIZE + ii / TILE_SIZE; + uint byte_offset = tile_offset * TILE_SIZE * TILE_SIZE * 4; + + /* Note that lp_tiled_to_linear expects the tile parameter to + * point at the first tile in a whole-image sized array. In + * this code, we have only a single tile and have to do some + * pointer arithmetic to figure out where the "image" would have + * started. + */ + lp_tiled_to_linear(tile - byte_offset, linear_image, + x, y, TILE_SIZE, TILE_SIZE, + lpr->base.format, + lpr->row_stride[level], + 1); /* tiles per row */ + } + + llvmpipe_set_texture_tile_layout(lpr, face_slice, level, tx, ty, + LP_TEX_LAYOUT_LINEAR); +} + + +/** + * Get pointer to tiled data for rendering. + * \return pointer to the tiled data at the given tile position + */ +void +llvmpipe_swizzle_cbuf_tile(struct llvmpipe_resource *lpr, + unsigned face_slice, unsigned level, + unsigned x, unsigned y, + uint8_t *tile) +{ + uint8_t *linear_image; + + assert(x % TILE_SIZE == 0); + assert(y % TILE_SIZE == 0); + + /* compute address of the slice/face of the image that contains the tile */ + linear_image = llvmpipe_get_texture_image_address(lpr, face_slice, level, + LP_TEX_LAYOUT_LINEAR); + + if (linear_image) { + uint ii = x, jj = y; + uint tile_offset = jj / TILE_SIZE + ii / TILE_SIZE; + uint byte_offset = tile_offset * TILE_SIZE * TILE_SIZE * 4; + + /* Note that lp_linear_to_tiled expects the tile parameter to + * point at the first tile in a whole-image sized array. In + * this code, we have only a single tile and have to do some + * pointer arithmetic to figure out where the "image" would have + * started. + */ + lp_linear_to_tiled(linear_image, tile - byte_offset, + x, y, TILE_SIZE, TILE_SIZE, + lpr->base.format, + lpr->row_stride[level], + 1); /* tiles per row */ + } +} + + /** * Return size of resource in bytes */ diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h index 503b6a19a8d..4e4a65dcb40 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.h +++ b/src/gallium/drivers/llvmpipe/lp_texture.h @@ -223,6 +223,17 @@ llvmpipe_get_texture_tile(struct llvmpipe_resource *lpr, unsigned x, unsigned y); +void +llvmpipe_unswizzle_cbuf_tile(struct llvmpipe_resource *lpr, + unsigned face_slice, unsigned level, + unsigned x, unsigned y, + uint8_t *tile); + +void +llvmpipe_swizzle_cbuf_tile(struct llvmpipe_resource *lpr, + unsigned face_slice, unsigned level, + unsigned x, unsigned y, + uint8_t *tile); extern void llvmpipe_print_resources(void); -- 2.30.2